#Loading in mapping data that will allow us to go from Uniprot ID to gene ID.
# Will use this to fix the labels.
# Next time make sure to have fragpipe give gene level data instead of by uniprot id.
protein_to_gene = readr::read_csv("00_r_input_data/protein_id_to_gene.csv")This figure was generated in Biorender.
### Figure 1b ###
# Reading in the cleaned clinical metadata to start to make plots
clinical_metadata = readr::read_csv("00_r_input_data/clinical_metadata.csv")
p0 = clinical_metadata %>%
ggplot(aes(condition,fill = condition))+
geom_histogram(stat = "count")+
scale_fill_viridis_d()+
ggtitle("Condition")
p0# Defining the columns that we want to show patient charachteristics for
columns_to_look_at = c("gender", "charleson_index", "bacteremia_duration","gender","day_of_blood_draw","sensitive_to_vancomycin","condition","death_during_admission")
# Filtering the data to the relevant columns
sel = clinical_metadata %>%
select(sample_id,columns_to_look_at,pathogen) %>%
filter(!is.na(pathogen))
# Plotting Gender by Pathogen
p1 = sel %>%
ggplot(aes(gender,fill = pathogen))+
geom_histogram(stat = "count")+
scale_fill_viridis_d(end = 0.5)+
ggtitle("Gender")
#Plotting Charleson index by Pathogen
p2 = sel %>%
ggplot(aes(charleson_index,fill = pathogen))+
geom_histogram(stat = "count")+
scale_fill_viridis_d(end = 0.5)+
ggtitle("Charleson Index")
# Bacteremia Durration
p3 = sel %>%
ggplot(aes(bacteremia_duration,fill = pathogen))+
geom_histogram(stat = "count")+
scale_fill_viridis_d(end = 0.5)+
ggtitle("Bacteremia Duration")
# Day of Blood Draw
p4 = sel %>%
ggplot(aes(day_of_blood_draw,fill = pathogen))+
geom_histogram(stat = "count")+
scale_fill_viridis_d(end = 0.5)+
ggtitle("Day of Blood Draw")
# Sensitivity to Vancomycin
p5 = sel %>%
ggplot(aes(sensitive_to_vancomycin,fill = pathogen))+
geom_histogram(stat = "count")+
scale_fill_viridis_d(end = 0.5)+
ggtitle("Sensitivity to Vancomycin")
# Sensitivity to Vancomycin
p6 = sel %>%
ggplot(aes(death_during_admission,fill = pathogen))+
geom_histogram(stat = "count")+
scale_fill_viridis_d(end = 0.5)+
ggtitle("Death During Admission")
f1b = ggpubr::ggarrange(p1,p2,p3,p4,p5,p6,common.legend = TRUE,legend = "none",labels = c("i","ii","iii","iv","v","vi"))
f1bggsave("02_figures/f1_overview_of_study/f1b.pdf",plot = f1b, height = 10, width = 10,units = "in")### Figure 1C ###
#Loading in mapping file
mapping = readxl::read_excel("../proteomics/sample_mapping.xlsx")
#Loading in data
data = read_delim("../proteomics/fragpipe_output_fixed_labels/tmt-report/abundance_gene_MD.tsv")
#Converting to long format
long = data %>%
pivot_longer(6:length(.),names_to = "sample_name")
#Combining data and mapping
all = inner_join(mapping,long,by="sample_name")
#Converting into a matrix
mat = all %>%
dplyr::select(-ProteinID,-NumberPSM,-ReferenceIntensity,-MaxPepProb) %>%
pivot_wider(values_from = value,names_from = Index)
#Extracting only the data
dat = mat %>%
dplyr::select(8:length(.)) %>%
dplyr::select_if(~ !any(is.na(.)))
#Extracting only the metadata
md = mat %>%
dplyr::select(1:7)
#setting row names so we can look at the labels if need be
rownames(dat) <- paste0(md$condition,md$sample_name)
# Calculating distances
d = dist(dat)
# Showing as a dendrogram
dend = as.dendrogram(hclust(d, method = "ward.D2"))
library(circlize)
library(dendextend)
# adding color by the type of infection
colors_to_use = viridis::viridis(3)[as.numeric(as.factor(md$condition))]
# getting everything in the correct order
colors_to_use = colors_to_use[order.dendrogram(dend)]
# adding color to the dendrogram branches
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
#this is in circular form
pdf("02_figures/f1_overview_of_study/f1c.pdf")
circlize_dendrogram(dend,
labels_track_height = NA,
dend_track_height = 0.5,
labels = FALSE)
dev.off()## png
## 2
### Figure 1 D ###
## Reading in data ##
normalized_data = read_csv("normalized_metabolomics.csv")
##Why do we have different numbers of healthy?
test = normalized_data %>%
select(sample_id,condition) %>%
distinct() %>%
group_by(condition) %>%
summarise(n = n())
## Transforming the data ##
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value)
norm_dat = norm_wide %>%
dplyr::select(78:length(.))
norm_md = norm_wide %>%
dplyr::select(1:77)
## Plotting the overal clustering ##
rownames(norm_dat) = md$sample_id
d = dist(norm_dat)
dend = as.dendrogram(hclust(d, method = "ward.D2"))
#dend = as.dendrogram(hclust(d))
library(circlize)
library(dendextend)
# let's add some color:
colors_to_use = viridis::viridis(3)[as.numeric(as.factor(norm_md$condition))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
pdf("02_figures/f1_overview_of_study/f1d.pdf")
circlize_dendrogram(dend,
labels_track_height = NA,
dend_track_height = 0.5,
labels = FALSE)
dev.off()## png
## 2
#reading in the data
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
## Getting into a format ot use with EFS ##
clean = pl %>%
dplyr::group_by(Mixture,Gene,Channel,Condition) %>%
dplyr::summarise(sum = sum(Abundance)) %>%
ungroup()
infected_healthy_l = clean %>%
mutate(classlabels = case_when(Condition %in% c("Faecalis","Faecium") ~ 1,
TRUE ~ 0)) %>%
mutate(classlabels = as.logical(classlabels)) %>%
# - messes up EFS for some reason, very annoying.
mutate(Gene = gsub("-","_",Gene))
infected_healthy_w = infected_healthy_l %>%
tidyr::pivot_wider(names_from = Gene,
values_from = sum) %>%
mutate(sampleid = paste0(Mixture,".",Channel)) %>%
dplyr::select(-Mixture,-Channel,-Condition,-sampleid) %>%
dplyr::select_if(~ !any(is.na(.))) %>%
#needs to be a dataframe or EFS freaks out.
as.data.frame()
set.seed(2)
## Running EFS ##
ensemble_result = EFS::ensemble_fs(infected_healthy_w,1,cor_threshold = 0)## [1] "default value for NA_threshold = 0.2"
## [1] "default value for runs = 100"
## [1] "default value for selection is c(TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, FALSE, FALSE)"
## [1] "Start Median"
## [1] "Start Pearson"
## [1] "Start Spearman"
## [1] "Start LogReg"
## [1] "Start RF"
## [1] 1
## Time difference of 0.2822244 secs
## [1] 2
## Time difference of 0.276001 secs
## [1] 3
## Time difference of 0.27842 secs
## [1] 4
## Time difference of 0.2774577 secs
## [1] 5
## Time difference of 0.2770195 secs
## [1] 6
## Time difference of 0.2779212 secs
## [1] 7
## Time difference of 0.2768254 secs
## [1] 8
## Time difference of 0.2787247 secs
## [1] 9
## Time difference of 0.2777641 secs
## [1] 10
## Time difference of 0.2776346 secs
## [1] 11
## Time difference of 0.2767966 secs
## [1] 12
## Time difference of 0.2763455 secs
## [1] 13
## Time difference of 0.27529 secs
## [1] 14
## Time difference of 0.2789001 secs
## [1] 15
## Time difference of 0.2782972 secs
## [1] 16
## Time difference of 0.2782431 secs
## [1] 17
## Time difference of 0.2764843 secs
## [1] 18
## Time difference of 0.2772787 secs
## [1] 19
## Time difference of 0.2790523 secs
## [1] 20
## Time difference of 0.2790442 secs
## [1] 21
## Time difference of 0.274559 secs
## [1] 22
## Time difference of 0.2789614 secs
## [1] 23
## Time difference of 0.2759447 secs
## [1] 24
## Time difference of 0.2778068 secs
## [1] 25
## Time difference of 0.2759237 secs
## [1] 26
## Time difference of 0.2779393 secs
## [1] 27
## Time difference of 0.2793624 secs
## [1] 28
## Time difference of 0.2774329 secs
## [1] 29
## Time difference of 0.2793982 secs
## [1] 30
## Time difference of 0.3228397 secs
## [1] 31
## Time difference of 0.2887111 secs
## [1] 32
## Time difference of 0.2830222 secs
## [1] 33
## Time difference of 0.2764602 secs
## [1] 34
## Time difference of 0.2788086 secs
## [1] 35
## Time difference of 0.2780724 secs
## [1] 36
## Time difference of 0.2782059 secs
## [1] 37
## Time difference of 0.2767382 secs
## [1] 38
## Time difference of 0.2777145 secs
## [1] 39
## Time difference of 0.2783973 secs
## [1] 40
## Time difference of 0.2780206 secs
## [1] 41
## Time difference of 0.2785213 secs
## [1] 42
## Time difference of 0.2794218 secs
## [1] 43
## Time difference of 0.2788284 secs
## [1] 44
## Time difference of 0.2784224 secs
## [1] 45
## Time difference of 0.2785761 secs
## [1] 46
## Time difference of 0.2769692 secs
## [1] 47
## Time difference of 0.2778337 secs
## [1] 48
## Time difference of 0.2788315 secs
## [1] 49
## Time difference of 0.2774115 secs
## [1] 50
## Time difference of 0.2785761 secs
## [1] 51
## Time difference of 0.2768278 secs
## [1] 52
## Time difference of 0.2771301 secs
## [1] 53
## Time difference of 0.2786734 secs
## [1] 54
## Time difference of 0.2787666 secs
## [1] 55
## Time difference of 0.2775931 secs
## [1] 56
## Time difference of 0.2793219 secs
## [1] 57
## Time difference of 0.2764621 secs
## [1] 58
## Time difference of 0.2771785 secs
## [1] 59
## Time difference of 0.2762299 secs
## [1] 60
## Time difference of 0.2772322 secs
## [1] 61
## Time difference of 0.2768602 secs
## [1] 62
## Time difference of 0.2776904 secs
## [1] 63
## Time difference of 0.2786202 secs
## [1] 64
## Time difference of 0.2775733 secs
## [1] 65
## Time difference of 0.2778137 secs
## [1] 66
## Time difference of 0.2776201 secs
## [1] 67
## Time difference of 0.2780297 secs
## [1] 68
## Time difference of 0.279515 secs
## [1] 69
## Time difference of 0.2768619 secs
## [1] 70
## Time difference of 0.2770848 secs
## [1] 71
## Time difference of 0.2777214 secs
## [1] 72
## Time difference of 0.2786336 secs
## [1] 73
## Time difference of 0.2759225 secs
## [1] 74
## Time difference of 0.2775395 secs
## [1] 75
## Time difference of 0.2769277 secs
## [1] 76
## Time difference of 0.2771947 secs
## [1] 77
## Time difference of 0.2767229 secs
## [1] 78
## Time difference of 0.2783797 secs
## [1] 79
## Time difference of 0.2770898 secs
## [1] 80
## Time difference of 0.2785447 secs
## [1] 81
## Time difference of 0.311022 secs
## [1] 82
## Time difference of 0.2895238 secs
## [1] 83
## Time difference of 0.2880669 secs
## [1] 84
## Time difference of 0.2827408 secs
## [1] 85
## Time difference of 0.2800593 secs
## [1] 86
## Time difference of 0.2771797 secs
## [1] 87
## Time difference of 0.2780092 secs
## [1] 88
## Time difference of 0.2773471 secs
## [1] 89
## Time difference of 0.2771437 secs
## [1] 90
## Time difference of 0.2785864 secs
## [1] 91
## Time difference of 0.2773657 secs
## [1] 92
## Time difference of 0.2773061 secs
## [1] 93
## Time difference of 0.2771416 secs
## [1] 94
## Time difference of 0.2769239 secs
## [1] 95
## Time difference of 0.2760029 secs
## [1] 96
## Time difference of 0.2778893 secs
## [1] 97
## Time difference of 0.280009 secs
## [1] 98
## Time difference of 0.2766047 secs
## [1] 99
## Time difference of 0.2771409 secs
## [1] 100
## Time difference of 0.2787406 secs
## [1] "Build return matrix"
## [1] "Done"
## Time difference of 28.07087 secs
## Getting EFS Result in Plotable Format
long = ensemble_result %>%
as.data.frame() %>%
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.),names_to = "Gene") %>%
#converting genes back to proper notation.
mutate(Gene = gsub("_","-",Gene))
# Ranking Biomarkers
top = long %>%
group_by(Gene) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum)
# Extracting the top 10 ranked biomarkers
topn = top[1:10,] %>%
pull(Gene)
# Filtering the EFS data for plotting purposes
filtered = long %>%
filter(Gene %in% topn)
# Generating the EFS plot.
f2d = filtered %>%
ggplot(aes(reorder(Gene,value),value,fill = model))+
geom_col()+
coord_flip()+
viridis::scale_fill_viridis(discrete = TRUE)+
ggtitle("Top 10 Performing Protein Biomarkers")+
xlab("Protein Biomarkers")
f2dggsave("02_figures/f2_infection_v_healthy_proteomics/f2d.pdf",f2d,height = 8, width = 8)
# Writting to file to use in supplementary figures.
topn_proteomics = tibble(Gene = topn)
write_csv(topn_proteomics,"01_r_processed_data/top10_infected_v_healthy_proteomics_biomarkers.csv")# Reading in data
protein_sum = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
# Processing data
ROC_data = dplyr::filter(protein_sum,Gene %in% topn) %>%
dplyr::mutate(Condition = case_when(Condition %in% c("Faecalis","Faecium") ~ "infected",
TRUE ~ Condition))
source("functions.R")
f2e = make_roc_proteomics(ROC_data,topn)ggsave("02_figures/f2_infection_v_healthy_proteomics/f2e.pdf",f2e,height = 6, width = 16)### Unnormalized data ###
#reading and formatting the metabolomics data
data = read_csv("00_r_input_data/mzmine_output/features_quant.csv") %>%
janitor::clean_names() %>%
select(1:3,14:length(.)) %>%
pivot_longer(4:length(.),names_to = "metabolomics_id") %>%
mutate(metabolomics_id = gsub("_mz.*","",metabolomics_id),
metabolomics_id = toupper(metabolomics_id)) %>%
#need to add the RID prefix because having numeric column names (which we will have after pivoting the data) messes up lots of the functions.
mutate(row_id = paste0("RID_",row_id))
# Reading in the mapping
map = read_csv("00_r_input_data/metabolomics_mapping.csv")
md = read_csv("00_r_input_data/clinical_metadata.csv")
# combining the mapping and metadata
map_md = inner_join(map,md,by= "sample_id")
# Combining everything
all = inner_join(data,map_md,by = "metabolomics_id")
write_csv(all,"01_r_processed_data/unnormalized_metabolomics.csv")
##### Normalizing the metabolomics data ######
standard = all %>%
filter(row_id == "RID_6043") %>%
select(metabolomics_id,standard_value = value)
#The normalization that Robert used was to divide the value by the standard value, multiply by 1E6 ( so it isn't negative) and then take the log10 of it.
norm = all %>%
inner_join(standard, by = "metabolomics_id") %>%
mutate(norm_value = log10((value/standard_value) * 1E6))
inf = norm %>%
filter(norm_value == -Inf) %>%
select(row_id,value,standard_value,norm_value) %>%
pull(row_id) %>%
unique()
`%!in%` = Negate(`%in%`)
normalized_data = norm %>%
filter(row_id %!in% inf,
row_id != "RID_6043")
write_csv(normalized_data,"01_r_processed_data/normalized_metabolomics.csv")gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv",na = c("N/A","NA"))
## Reading in the normalized metabolomics data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
# Performing statistics so we can make a volcano plot
stat = normalized_data %>%
dplyr::group_by(row_id) %>%
rstatix::t_test(norm_value ~ infection_status) %>%
rstatix::adjust_pvalue(p.col = "p",output.col = "p.adj_fdr", method = "fdr")
# Extracting the values for each feature for healthy patients
healthy = normalized_data %>%
filter(infection_status == "uninfected") %>%
group_by(row_id) %>%
summarise(healthy = mean(norm_value))
# Extracting the values for each feature for infected patients
infected = normalized_data %>%
filter(infection_status == "infected") %>%
group_by(row_id) %>%
summarise(infected = mean(norm_value))
## Combining the healthy and infected so we can caclulated the log2fc ##
log2fc = inner_join(healthy,infected,by = "row_id") %>%
mutate(log2fc_infected_healthy = log2(infected/healthy))
## Joining the statistics and log2 fold change ##
vp = inner_join(stat,log2fc,by = "row_id") %>%
mutate(annotated = case_when(row_id %in% gnps_annotations$row_id ~ TRUE,
TRUE ~ FALSE))
f3a = vp %>%
ggplot(aes(log2fc_infected_healthy,-log10(p.adj_fdr),color = annotated))+
geom_point()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed",color = "red")+
xlab("log2FC (Infected/Healthy)")+
theme(legend.position = "bottom")
f3aggsave(f3a,filename = "02_figures/f3_infection_v_healthy_metabolomics/f3a.pdf",height = 6, width = 8)## Reading in the data ##
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv",na = c("N/A","NA"))
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
unnormalized_data = read_csv("01_r_processed_data/unnormalized_metabolomics.csv")
## Calculating the number of features ##
n_identified_features_gnps = nrow(gnps_annotations)
total_features = length(unique(unnormalized_data$row_id))
features_after_norm = length(unique(normalized_data$row_id))
percent_of_total_features_id = round(n_identified_features_gnps / total_features * 100,)
percent_of_norm_features_id = round(n_identified_features_gnps / features_after_norm * 100,0)
# Putting together a data frame #
feature_count = tibble(`# total features` = total_features,
`# features after normalization` = features_after_norm,
`# of identified features by GNPS` = n_identified_features_gnps,
`% of total features ided` = percent_of_total_features_id,
`% of normalized features ided` = percent_of_norm_features_id) %>%
pivot_longer(1:length(.))
#Making a plot of the table
pdf("02_figures/f3_infection_v_healthy_metabolomics/f3b.pdf",height = 4, width = 4)
gridExtra::grid.table(feature_count)
dev.off()## png
## 2
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Structuring the data for enrichment Analysis ##
gnps_enrichment_data = gnps_annotations %>%
#Manual conversion of unreasonable name to what it is listed as in MASSBANK
select(compound_name_cleaned,npclassifier_superclass,npclassifier_class,npclassifier_pathway) %>%
mutate(GO_id = paste0(npclassifier_superclass,"_",npclassifier_class,"_",npclassifier_pathway))
#Making a makeshift "GO" type database using the GNPS annotations.
TERM2GENE = gnps_enrichment_data %>%
dplyr::select(TERM = GO_id,GENE = compound_name_cleaned) %>%
as.data.frame()
TERM2NAME = gnps_enrichment_data %>%
select(-compound_name_cleaned) %>%
pivot_longer(cols = 1:3) %>%
select(keys = GO_id,columns = value) %>%
as.data.frame()
## Extracting the row_ids that were significant from the volcano plot ##
stat_different = vp %>%
filter(p.adj_fdr <= 0.05)
#
#converting RID to compound_name
stat_different_map = inner_join(vp,gnps_annotations,by = "row_id")
## Extracting the row ids that were upregulated in
up = stat_different_map %>%
filter(log2fc_infected_healthy > 0 ) %>%
pull(compound_name_cleaned)
# Performing an enrichment
enriched_in_infected = clusterProfiler::enricher(up,
universe = gnps_enrichment_data$compound_name_cleaned,
TERM2GENE = TERM2GENE,
TERM2NAME = TERM2NAME
)
f3c = clusterProfiler::cnetplot(enriched_in_infected,cex_label_category = 2.5)
f3cggsave("02_figures/f3_infection_v_healthy_metabolomics/f3c.pdf",f3c,height = 9,width = 9)## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Switching to the wide format ###
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value)
## Looking for only columns that were annotated ##
gnps_and_in_norm = intersect(unique(gnps_annotations$row_id), colnames(norm_wide))
gnps_annotated_metabolites = norm_wide %>%
as.data.frame() %>%
select(condition,gnps_and_in_norm)
## Extracting the numeric data ##
efs_sel = gnps_annotated_metabolites %>%
mutate(classlabels = case_when(condition %in% c("faecalis","faecium") ~ 0,
TRUE ~ 1)) %>%
select(classlabels,2:length(.)) %>%
#absolutely needs to be a data frame and not a tibble for EFS package
as.data.frame()
# Seting seed incase this isn't done internally in EFS
set.seed(2)
## Running EFS feature selection ##
ensemble_result = EFS::ensemble_fs(efs_sel,
classnumber = 1,cor_threshold = 0)## [1] "default value for NA_threshold = 0.2"
## [1] "default value for runs = 100"
## [1] "default value for selection is c(TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, FALSE, FALSE)"
## [1] "Start Median"
## [1] "Start Pearson"
## [1] "Start Spearman"
## [1] "Start LogReg"
## [1] "Start RF"
## [1] 1
## Time difference of 0.2082546 secs
## [1] 2
## Time difference of 0.2078691 secs
## [1] 3
## Time difference of 0.2080035 secs
## [1] 4
## Time difference of 0.2069833 secs
## [1] 5
## Time difference of 0.2074885 secs
## [1] 6
## Time difference of 0.2074018 secs
## [1] 7
## Time difference of 0.2059116 secs
## [1] 8
## Time difference of 0.2061393 secs
## [1] 9
## Time difference of 0.2083662 secs
## [1] 10
## Time difference of 0.2077384 secs
## [1] 11
## Time difference of 0.2069712 secs
## [1] 12
## Time difference of 0.2073445 secs
## [1] 13
## Time difference of 0.2066495 secs
## [1] 14
## Time difference of 0.206538 secs
## [1] 15
## Time difference of 0.2385371 secs
## [1] 16
## Time difference of 0.2072957 secs
## [1] 17
## Time difference of 0.2052722 secs
## [1] 18
## Time difference of 0.2057483 secs
## [1] 19
## Time difference of 0.2084563 secs
## [1] 20
## Time difference of 0.2062871 secs
## [1] 21
## Time difference of 0.2070186 secs
## [1] 22
## Time difference of 0.2074714 secs
## [1] 23
## Time difference of 0.2072351 secs
## [1] 24
## Time difference of 0.2071924 secs
## [1] 25
## Time difference of 0.206424 secs
## [1] 26
## Time difference of 0.2077479 secs
## [1] 27
## Time difference of 0.2067168 secs
## [1] 28
## Time difference of 0.2072783 secs
## [1] 29
## Time difference of 0.2070918 secs
## [1] 30
## Time difference of 0.2059109 secs
## [1] 31
## Time difference of 0.2076771 secs
## [1] 32
## Time difference of 0.2072468 secs
## [1] 33
## Time difference of 0.2062991 secs
## [1] 34
## Time difference of 0.2071722 secs
## [1] 35
## Time difference of 0.2067716 secs
## [1] 36
## Time difference of 0.205621 secs
## [1] 37
## Time difference of 0.2062902 secs
## [1] 38
## Time difference of 0.206733 secs
## [1] 39
## Time difference of 0.2067428 secs
## [1] 40
## Time difference of 0.2060871 secs
## [1] 41
## Time difference of 0.2074089 secs
## [1] 42
## Time difference of 0.208744 secs
## [1] 43
## Time difference of 0.2061419 secs
## [1] 44
## Time difference of 0.2059016 secs
## [1] 45
## Time difference of 0.2073395 secs
## [1] 46
## Time difference of 0.208194 secs
## [1] 47
## Time difference of 0.2064514 secs
## [1] 48
## Time difference of 0.2066739 secs
## [1] 49
## Time difference of 0.208406 secs
## [1] 50
## Time difference of 0.2052522 secs
## [1] 51
## Time difference of 0.2054951 secs
## [1] 52
## Time difference of 0.206871 secs
## [1] 53
## Time difference of 0.2071319 secs
## [1] 54
## Time difference of 0.2072175 secs
## [1] 55
## Time difference of 0.207309 secs
## [1] 56
## Time difference of 0.2076159 secs
## [1] 57
## Time difference of 0.2089806 secs
## [1] 58
## Time difference of 0.207855 secs
## [1] 59
## Time difference of 0.2057042 secs
## [1] 60
## Time difference of 0.2083879 secs
## [1] 61
## Time difference of 0.2081385 secs
## [1] 62
## Time difference of 0.2093251 secs
## [1] 63
## Time difference of 0.2090251 secs
## [1] 64
## Time difference of 0.2074213 secs
## [1] 65
## Time difference of 0.2074411 secs
## [1] 66
## Time difference of 0.2081702 secs
## [1] 67
## Time difference of 0.2094545 secs
## [1] 68
## Time difference of 0.2084265 secs
## [1] 69
## Time difference of 0.2085903 secs
## [1] 70
## Time difference of 0.208122 secs
## [1] 71
## Time difference of 0.2071164 secs
## [1] 72
## Time difference of 0.2090859 secs
## [1] 73
## Time difference of 0.2097008 secs
## [1] 74
## Time difference of 0.210088 secs
## [1] 75
## Time difference of 0.2093132 secs
## [1] 76
## Time difference of 0.2073915 secs
## [1] 77
## Time difference of 0.2088795 secs
## [1] 78
## Time difference of 0.209394 secs
## [1] 79
## Time difference of 0.2094882 secs
## [1] 80
## Time difference of 0.2084107 secs
## [1] 81
## Time difference of 0.2078941 secs
## [1] 82
## Time difference of 0.2081702 secs
## [1] 83
## Time difference of 0.2091987 secs
## [1] 84
## Time difference of 0.2087274 secs
## [1] 85
## Time difference of 0.2083836 secs
## [1] 86
## Time difference of 0.2083969 secs
## [1] 87
## Time difference of 0.236011 secs
## [1] 88
## Time difference of 0.2065244 secs
## [1] 89
## Time difference of 0.2064991 secs
## [1] 90
## Time difference of 0.2073836 secs
## [1] 91
## Time difference of 0.2056842 secs
## [1] 92
## Time difference of 0.2048037 secs
## [1] 93
## Time difference of 0.207058 secs
## [1] 94
## Time difference of 0.2065823 secs
## [1] 95
## Time difference of 0.2074566 secs
## [1] 96
## Time difference of 0.2086294 secs
## [1] 97
## Time difference of 0.2070882 secs
## [1] 98
## Time difference of 0.2065704 secs
## [1] 99
## Time difference of 0.2056341 secs
## [1] 100
## Time difference of 0.2069664 secs
## [1] "Build return matrix"
## [1] "Done"
## Time difference of 20.92777 secs
## Formating the data as long ##
long = ensemble_result %>%
as.data.frame() %>%
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.),names_to = "row_id")
## Extracting top metabolites ##
top = long %>%
group_by(row_id) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum)
## Pulling top N metabolites ##
topn = top[1:10,] %>%
pull(row_id)
#writing file contining the topn row ids
topn_metabolomics = tibble(row_id = topn)
write_csv(topn_metabolomics,"01_r_processed_data/top10_infected_v_healthy_metabolomics_biomarkers.csv")
## Filtering the data, combining annotation names ##
filtered = long %>%
filter(row_id %in% topn) %>%
inner_join(gnps_annotations,by = "row_id")
## Plotting the data ##
f3d = filtered %>%
ggplot(aes(reorder(compound_name_cleaned,value),value,fill = model))+
geom_col()+
coord_flip()+
viridis::scale_fill_viridis(discrete = TRUE)+
ggtitle("Top 10 Performing Metabolite Biomarkers of Infection")+
xlab("Metabolite Biomarkers")
f3dggsave("02_figures/f3_infection_v_healthy_metabolomics/f3d.pdf",f3d,height = 8,width = 8,units = "in")## Arranging Data so it is in the correct order ###
ROC_data = dplyr::filter(normalized_data,row_id %in% topn) %>%
dplyr::arrange(factor(row_id,topn)) %>%
inner_join(gnps_annotations,by = c("row_id" = "row_id"))
source("functions.R")
f3e = make_roc_metabolomics(ROC_data)ggsave("02_figures/f3_infection_v_healthy_metabolomics/f3e.pdf",f3e,height=7.2,width=19.2)faecalis_v_faecium = readr::read_csv("01_r_processed_data/faecalis_v_faecium.csv") %>%
left_join(protein_to_gene,by = c("uniprotid" = "ProteinID"))
f4a = faecalis_v_faecium %>%
ggplot(aes(log2FC,-log10(adj.pvalue)))+
geom_point()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed",color = "red")+
xlab("Log2FC Faecalis / Faecium")+
scale_colour_viridis_d()+
theme(legend.position = "bottom")
f4aggsave("02_figures/f4_faecalis_v_faecium_proteomics/f4a.pdf",f4a,height = 6, width = 8)up_in_faecalis = faecalis_v_faecium %>%
filter(log2FC>0) %>%
filter(adj.pvalue <= 0.05) %>%
arrange(adj.pvalue) %>%
pull(Gene)
#Getting all the go annotations
go = read_csv("00_r_input_data/go_annotations.csv") %>%
dplyr::select(1,length(.)) %>%
tidyr::separate_rows(Gene.Ontology.IDs, sep = ';') %>%
dplyr::mutate(Gene.Ontology.IDs = gsub(" ","",Gene.Ontology.IDs)) %>%
# Going from Protein ID to gene ID
left_join(protein_to_gene,by = c("Entry" = "ProteinID")) %>%
select(-Entry) %>%
select(Entry = Gene,Gene.Ontology.IDs)
#usng enricher
TERM2GENE = go %>%
dplyr::select(TERM = Gene.Ontology.IDs,GENE = Entry) %>%
as.data.frame()
TERM2NAME = AnnotationDbi::select(GO.db::GO.db,keys = unique(TERM2GENE$TERM),columns = c("TERM")) %>%
as.data.frame()
enrich = clusterProfiler::enricher(up_in_faecalis,
universe = go$Entry,
TERM2GENE = TERM2GENE,
TERM2NAME = TERM2NAME)
f4b = clusterProfiler::cnetplot(enrich,
categorySize="pvalue",
cex_label_category = 2.5,
showCategory = 100)
f4bggsave("02_figures/f4_faecalis_v_faecium_proteomics/f4b.pdf",height = 15, width = 15)up_in_faecium = faecalis_v_faecium %>%
filter(log2FC<0) %>%
filter(adj.pvalue <= 0.05) %>%
arrange(adj.pvalue) %>%
pull(Gene)
enrich = clusterProfiler::enricher(up_in_faecium,
universe = go$Entry,
TERM2GENE = TERM2GENE,
TERM2NAME = TERM2NAME)
f4c = clusterProfiler::cnetplot(enrich,
categorySize="pvalue",
cex_label_category = 2.5,
showCategory = 100)
f4cggsave("02_figures/f4_faecalis_v_faecium_proteomics/f4c.pdf",f4c,height = 10, width = 10)library(EFS)
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
clean = pl %>%
dplyr::group_by(Mixture,Gene,Channel,Condition) %>%
dplyr::summarise(sum = sum(Abundance))
faecium_v_faecalis_l = clean %>%
filter(Condition %in% c("Faecium", "Faecalis")) %>%
mutate(classlabels = case_when(Condition == "Faecalis" ~ 1,
TRUE ~ 0)) %>%
# EFS can't handle the - charachter.
mutate(Gene = gsub("-","_",Gene))
faecium_v_faecalis_w = faecium_v_faecalis_l %>%
pivot_wider(names_from = Gene,
values_from = sum) %>%
mutate(sampleid = paste0(Mixture,".",Channel)) %>%
ungroup() %>%
dplyr::select(-Mixture,-Channel,-Condition,-sampleid) %>%
select_if(~ !any(is.na(.))) %>%
as.data.frame()
#need to come up with some sort of way to combine all of the fractions and plexes
# The problem is that na.omit filters everything out.
ensemble_result = EFS::ensemble_fs(faecium_v_faecalis_w,1,cor_threshold = 0)## [1] "default value for NA_threshold = 0.2"
## [1] "default value for runs = 100"
## [1] "default value for selection is c(TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, FALSE, FALSE)"
## [1] "Start Median"
## [1] "Start Pearson"
## [1] "Start Spearman"
## [1] "Start LogReg"
## [1] "Start RF"
## [1] 1
## Time difference of 0.3025377 secs
## [1] 2
## Time difference of 0.3023775 secs
## [1] 3
## Time difference of 0.3013721 secs
## [1] 4
## Time difference of 0.3018656 secs
## [1] 5
## Time difference of 0.3024657 secs
## [1] 6
## Time difference of 0.3019772 secs
## [1] 7
## Time difference of 0.3027401 secs
## [1] 8
## Time difference of 0.302598 secs
## [1] 9
## Time difference of 0.3030314 secs
## [1] 10
## Time difference of 0.3027594 secs
## [1] 11
## Time difference of 0.3028541 secs
## [1] 12
## Time difference of 0.3015192 secs
## [1] 13
## Time difference of 0.3019595 secs
## [1] 14
## Time difference of 0.3016484 secs
## [1] 15
## Time difference of 0.3024187 secs
## [1] 16
## Time difference of 0.3224609 secs
## [1] 17
## Time difference of 0.304636 secs
## [1] 18
## Time difference of 0.3050363 secs
## [1] 19
## Time difference of 0.30246 secs
## [1] 20
## Time difference of 0.301471 secs
## [1] 21
## Time difference of 0.3027263 secs
## [1] 22
## Time difference of 0.3034954 secs
## [1] 23
## Time difference of 0.3015976 secs
## [1] 24
## Time difference of 0.302814 secs
## [1] 25
## Time difference of 0.3016601 secs
## [1] 26
## Time difference of 0.3012161 secs
## [1] 27
## Time difference of 0.3015015 secs
## [1] 28
## Time difference of 0.3027792 secs
## [1] 29
## Time difference of 0.3029506 secs
## [1] 30
## Time difference of 0.3030524 secs
## [1] 31
## Time difference of 0.3032999 secs
## [1] 32
## Time difference of 0.301285 secs
## [1] 33
## Time difference of 0.3014643 secs
## [1] 34
## Time difference of 0.3015864 secs
## [1] 35
## Time difference of 0.3021483 secs
## [1] 36
## Time difference of 0.3014917 secs
## [1] 37
## Time difference of 0.3029888 secs
## [1] 38
## Time difference of 0.3019943 secs
## [1] 39
## Time difference of 0.3019638 secs
## [1] 40
## Time difference of 0.3010771 secs
## [1] 41
## Time difference of 0.3011096 secs
## [1] 42
## Time difference of 0.3025193 secs
## [1] 43
## Time difference of 0.30269 secs
## [1] 44
## Time difference of 0.302659 secs
## [1] 45
## Time difference of 0.3020029 secs
## [1] 46
## Time difference of 0.3012071 secs
## [1] 47
## Time difference of 0.301048 secs
## [1] 48
## Time difference of 0.3018038 secs
## [1] 49
## Time difference of 0.3022287 secs
## [1] 50
## Time difference of 0.3035562 secs
## [1] 51
## Time difference of 0.3027992 secs
## [1] 52
## Time difference of 0.3024087 secs
## [1] 53
## Time difference of 0.3022249 secs
## [1] 54
## Time difference of 0.3003185 secs
## [1] 55
## Time difference of 0.3019459 secs
## [1] 56
## Time difference of 0.3042529 secs
## [1] 57
## Time difference of 0.3030276 secs
## [1] 58
## Time difference of 0.303436 secs
## [1] 59
## Time difference of 0.301291 secs
## [1] 60
## Time difference of 0.3026381 secs
## [1] 61
## Time difference of 0.3016512 secs
## [1] 62
## Time difference of 0.3019788 secs
## [1] 63
## Time difference of 0.3037755 secs
## [1] 64
## Time difference of 0.3031068 secs
## [1] 65
## Time difference of 0.3014209 secs
## [1] 66
## Time difference of 0.3025939 secs
## [1] 67
## Time difference of 0.3229866 secs
## [1] 68
## Time difference of 0.302778 secs
## [1] 69
## Time difference of 0.3022819 secs
## [1] 70
## Time difference of 0.3037021 secs
## [1] 71
## Time difference of 0.3025043 secs
## [1] 72
## Time difference of 0.301703 secs
## [1] 73
## Time difference of 0.3018141 secs
## [1] 74
## Time difference of 0.30281 secs
## [1] 75
## Time difference of 0.3017972 secs
## [1] 76
## Time difference of 0.3012011 secs
## [1] 77
## Time difference of 0.30234 secs
## [1] 78
## Time difference of 0.3013129 secs
## [1] 79
## Time difference of 0.3008969 secs
## [1] 80
## Time difference of 0.3015954 secs
## [1] 81
## Time difference of 0.303174 secs
## [1] 82
## Time difference of 0.3030908 secs
## [1] 83
## Time difference of 0.3027947 secs
## [1] 84
## Time difference of 0.3020053 secs
## [1] 85
## Time difference of 0.3020568 secs
## [1] 86
## Time difference of 0.3029008 secs
## [1] 87
## Time difference of 0.3030374 secs
## [1] 88
## Time difference of 0.3029613 secs
## [1] 89
## Time difference of 0.3017976 secs
## [1] 90
## Time difference of 0.3019817 secs
## [1] 91
## Time difference of 0.3032191 secs
## [1] 92
## Time difference of 0.3046863 secs
## [1] 93
## Time difference of 0.3024049 secs
## [1] 94
## Time difference of 0.301939 secs
## [1] 95
## Time difference of 0.3021266 secs
## [1] 96
## Time difference of 0.3032155 secs
## [1] 97
## Time difference of 0.302983 secs
## [1] 98
## Time difference of 0.301034 secs
## [1] 99
## Time difference of 0.3016176 secs
## [1] 100
## Time difference of 0.3011901 secs
## [1] "Build return matrix"
## [1] "Done"
## Time difference of 30.3916 secs
long = ensemble_result %>%
as.data.frame() %>%
#Converting gene names back
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.),names_to = "Gene") %>%
mutate(Gene = gsub("_","-",Gene))
top = long %>%
group_by(Gene) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum)
topn = top[1:10,] %>%
pull(Gene)
# writing to file
topn_proteomics = tibble(Gene = topn)
write_csv(topn_proteomics,"01_r_processed_data/top10_faecalis_v_faecium_proteomics_biomarkers.csv")
f4d = long %>%
filter(Gene %in% topn) %>%
ggplot(aes(reorder(Gene,value),value,fill = model))+
geom_col()+
coord_flip()+
viridis::scale_fill_viridis(discrete = TRUE)+
ggtitle("Top 10 Biomarkers Differentiating Faecalis from Faecium by EFS")+
xlab("Protein Biomarkers")
f4dggsave("02_figures/f4_faecalis_v_faecium_proteomics/f4d.pdf",f4d,height = 8,width = 8)ROC_data = dplyr::filter(protein_sum,Gene %in% topn) %>%
dplyr::filter(Condition %in% c("Faecalis","Faecium"))
source("functions.R")
f4e = make_roc_proteomics(ROC_data,topn)ggsave("02_figures/f4_faecalis_v_faecium_proteomics/f4e.pdf",f4e,height=6,width=16)## Reading in Data
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
# Performing statistics so we can make a volcano plot
stat = normalized_data %>%
filter(condition %in% c("faecalis","faecium")) %>%
dplyr::group_by(row_id) %>%
rstatix::t_test(norm_value ~ condition) %>%
rstatix::adjust_pvalue(p.col = "p",output.col = "p.adj_fdr", method = "fdr")
# Calculating log2fc
log2fc_data = normalized_data %>%
dplyr::select(metabolomics_id,row_id,condition,norm_value)
# Extracting just the faecalis data
faecalis = log2fc_data %>%
filter(condition == "faecalis") %>%
group_by(row_id) %>%
summarise(faecalis = mean(norm_value))
# Extracting just the faecium data
faecium = log2fc_data %>%
filter(condition == "faecium") %>%
group_by(row_id) %>%
summarise(faecium = mean(norm_value))
# Calculating log2fc
log2fc = inner_join(faecalis,faecium,by = "row_id") %>%
mutate(log2fc_faecalis_faecium = log2(faecalis/faecium))
# Combining log2fc and statistics
vp = inner_join(stat,log2fc,by = "row_id") %>%
mutate(annotated = case_when(row_id %in% gnps_annotations$row_id ~ TRUE,
TRUE ~ FALSE))
annotations = vp %>%
filter(p.adj_fdr <= 0.05,
annotated == TRUE) %>%
inner_join(gnps_annotations, by = "row_id")
f5a = vp %>%
ggplot(aes(log2fc_faecalis_faecium,-log10(p.adj_fdr),color = annotated))+
geom_point()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed",color = "red")+
xlab("log2FC (Faecalis/Faecium)")+
theme(legend.position = "bottom")+
ggrepel::geom_label_repel(data = annotations,aes(label = compound_name_cleaned,x= log2fc_faecalis_faecium, y = -log10(p.adj_fdr)),)
f5aggsave("02_figures/f5_faecalis_v_faecium_metabolomics/f5a.pdf",f5a,height = 6, width =8)## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv") %>%
#Removing reduntant RIDs. These were found to be the same as other top biomarkers
filter(row_id != "RID_8856",
row_id != "RID_22529",
row_id != "RID_22270",
row_id != "RID_9436")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Switching to the wide format ###
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value)
## Looking for only columns that were annotated ##
gnps_and_in_norm = intersect(unique(gnps_annotations$row_id), colnames(norm_wide))
gnps_annotated_metabolites = norm_wide %>%
as.data.frame() %>%
select(condition,gnps_and_in_norm)
## Extracting the numeric data ##
efs_sel = gnps_annotated_metabolites %>%
filter(condition %in% c("faecalis","faecium")) %>%
mutate(classlabels = case_when(condition == "faecalis" ~ 0,
TRUE ~ 1)) %>%
select(classlabels,2:length(.)) %>%
#absolutely needs to be a data frame and not a tibble for EFS package
as.data.frame()
# Seting seed incase this isn't done internally in EFS
set.seed(2)
## Running EFS feature selection ##
ensemble_result = EFS::ensemble_fs(efs_sel,
classnumber = 1,
cor_threshold = 0)## [1] "default value for NA_threshold = 0.2"
## [1] "default value for runs = 100"
## [1] "default value for selection is c(TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, FALSE, FALSE)"
## [1] "Start Median"
## [1] "Start Pearson"
## [1] "Start Spearman"
## [1] "Start LogReg"
## [1] "Start RF"
## [1] 1
## Time difference of 0.2534938 secs
## [1] 2
## Time difference of 0.2499523 secs
## [1] 3
## Time difference of 0.2506042 secs
## [1] 4
## Time difference of 0.2502916 secs
## [1] 5
## Time difference of 0.2610362 secs
## [1] 6
## Time difference of 0.2495589 secs
## [1] 7
## Time difference of 0.2484431 secs
## [1] 8
## Time difference of 0.2486982 secs
## [1] 9
## Time difference of 0.249088 secs
## [1] 10
## Time difference of 0.2493234 secs
## [1] 11
## Time difference of 0.2477839 secs
## [1] 12
## Time difference of 0.249017 secs
## [1] 13
## Time difference of 0.2485266 secs
## [1] 14
## Time difference of 0.2483208 secs
## [1] 15
## Time difference of 0.2483742 secs
## [1] 16
## Time difference of 0.2493215 secs
## [1] 17
## Time difference of 0.2488618 secs
## [1] 18
## Time difference of 0.2487681 secs
## [1] 19
## Time difference of 0.2476819 secs
## [1] 20
## Time difference of 0.2492435 secs
## [1] 21
## Time difference of 0.2488761 secs
## [1] 22
## Time difference of 0.2485516 secs
## [1] 23
## Time difference of 0.2497113 secs
## [1] 24
## Time difference of 0.2495334 secs
## [1] 25
## Time difference of 0.2501955 secs
## [1] 26
## Time difference of 0.2560887 secs
## [1] 27
## Time difference of 0.2481182 secs
## [1] 28
## Time difference of 0.2489748 secs
## [1] 29
## Time difference of 0.2485123 secs
## [1] 30
## Time difference of 0.2476974 secs
## [1] 31
## Time difference of 0.2478409 secs
## [1] 32
## Time difference of 0.2477689 secs
## [1] 33
## Time difference of 0.2482469 secs
## [1] 34
## Time difference of 0.2488599 secs
## [1] 35
## Time difference of 0.2495916 secs
## [1] 36
## Time difference of 0.2481334 secs
## [1] 37
## Time difference of 0.2484608 secs
## [1] 38
## Time difference of 0.248378 secs
## [1] 39
## Time difference of 0.248677 secs
## [1] 40
## Time difference of 0.2466803 secs
## [1] 41
## Time difference of 0.2478027 secs
## [1] 42
## Time difference of 0.2475133 secs
## [1] 43
## Time difference of 0.2479322 secs
## [1] 44
## Time difference of 0.247762 secs
## [1] 45
## Time difference of 0.2481294 secs
## [1] 46
## Time difference of 0.2542868 secs
## [1] 47
## Time difference of 0.2486823 secs
## [1] 48
## Time difference of 0.2485063 secs
## [1] 49
## Time difference of 0.2487452 secs
## [1] 50
## Time difference of 0.2494619 secs
## [1] 51
## Time difference of 0.2477829 secs
## [1] 52
## Time difference of 0.2476428 secs
## [1] 53
## Time difference of 0.2486858 secs
## [1] 54
## Time difference of 0.2494276 secs
## [1] 55
## Time difference of 0.2491021 secs
## [1] 56
## Time difference of 0.2481556 secs
## [1] 57
## Time difference of 0.2494004 secs
## [1] 58
## Time difference of 0.2490067 secs
## [1] 59
## Time difference of 0.2482679 secs
## [1] 60
## Time difference of 0.2484646 secs
## [1] 61
## Time difference of 0.2486014 secs
## [1] 62
## Time difference of 0.2475164 secs
## [1] 63
## Time difference of 0.2475064 secs
## [1] 64
## Time difference of 0.2484939 secs
## [1] 65
## Time difference of 0.2488475 secs
## [1] 66
## Time difference of 0.2511301 secs
## [1] 67
## Time difference of 0.2552962 secs
## [1] 68
## Time difference of 0.2496622 secs
## [1] 69
## Time difference of 0.2482533 secs
## [1] 70
## Time difference of 0.2486625 secs
## [1] 71
## Time difference of 0.2492678 secs
## [1] 72
## Time difference of 0.2485709 secs
## [1] 73
## Time difference of 0.2483342 secs
## [1] 74
## Time difference of 0.2486928 secs
## [1] 75
## Time difference of 0.2485778 secs
## [1] 76
## Time difference of 0.2483637 secs
## [1] 77
## Time difference of 0.2480485 secs
## [1] 78
## Time difference of 0.2479527 secs
## [1] 79
## Time difference of 0.2488806 secs
## [1] 80
## Time difference of 0.2468014 secs
## [1] 81
## Time difference of 0.2487602 secs
## [1] 82
## Time difference of 0.2479839 secs
## [1] 83
## Time difference of 0.2481003 secs
## [1] 84
## Time difference of 0.2481091 secs
## [1] 85
## Time difference of 0.2498031 secs
## [1] 86
## Time difference of 0.2480178 secs
## [1] 87
## Time difference of 0.2498536 secs
## [1] 88
## Time difference of 0.2557306 secs
## [1] 89
## Time difference of 0.2488866 secs
## [1] 90
## Time difference of 0.2491601 secs
## [1] 91
## Time difference of 0.2490942 secs
## [1] 92
## Time difference of 0.2487257 secs
## [1] 93
## Time difference of 0.2488418 secs
## [1] 94
## Time difference of 0.2481315 secs
## [1] 95
## Time difference of 0.248893 secs
## [1] 96
## Time difference of 0.2483821 secs
## [1] 97
## Time difference of 0.2490871 secs
## [1] 98
## Time difference of 0.2481277 secs
## [1] 99
## Time difference of 0.2487686 secs
## [1] 100
## Time difference of 0.2495427 secs
## [1] "Build return matrix"
## [1] "Done"
## Time difference of 25.00086 secs
## Formating the data as long ##
long = ensemble_result %>%
as.data.frame() %>%
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.),names_to = "row_id") %>%
inner_join(gnps_annotations,by = "row_id")
## Extracting top metabolites ##
top = long %>%
group_by(row_id,compound_name_cleaned) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum)
## Pulling top N metabolites ##
topn = top[1:10,] %>%
pull(row_id)
topn_metabolomics = tibble(row_id = topn)
write_csv(topn_metabolomics,"01_r_processed_data/top10_faecalis_v_faecium_metabolomics_biomarkers.csv")
## Filtering the data, combining annotation names ##
filtered = long %>%
filter(row_id %in% topn)
## Plotting the data ##
f5b = filtered %>%
ggplot(aes(reorder(compound_name_cleaned,value),value,fill = model))+
geom_col()+
coord_flip()+
viridis::scale_fill_viridis(discrete = TRUE)+
xlab("Metabolite Biomarkers")+
theme(legend.position = "bottom")
f5bggsave("02_figures/f5_faecalis_v_faecium_metabolomics/f5b.pdf",f5b,height = 8,width = 8)mycophenolic acid and pc(17:1/0:0 + c25h51n1o7p1) were detected as two seperate row IDs. They were both found to be among the best biomarkers independantly from each other.
ROC_data = dplyr::filter(normalized_data,row_id %in% topn) %>%
dplyr::arrange(factor(row_id,topn)) %>%
inner_join(gnps_annotations,by = "row_id") %>%
filter(condition != "healthy") %>%
mutate(infection_status = condition)
f5c = make_roc_metabolomics(ROC_data)ggsave("02_figures/f5_faecalis_v_faecium_metabolomics/f5c.pdf",f5c,height = 9, width = 24)##Proteomics data ##
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv") %>%
select(tmt_id = sample_id,everything())
## Clinical metadata ##
map = readxl::read_excel("00_r_input_data/proteomics_sample_mapping.xlsx") %>%
mutate(tmt_id = paste0("plex",plex_number,".",TMT_channel)) %>%
select(tmt_id,sample_id = sample_name)
md = read_csv("00_r_input_data/clinical_metadata.csv")
md_final = inner_join(map,md,by = "sample_id")
clean_clinical = inner_join(pl,md_final, by = "tmt_id")
clean_clinical_wide = clean_clinical %>%
select(sample_id,death_during_admission,Gene,Abundance) %>%
tidyr::pivot_wider(names_from = Gene,
values_from = Abundance) %>%
#removing the samples without death during admission info
filter(!is.na(death_during_admission)) %>%
#filter(sample_id.y != "S49",
#sample_id.y != "S76") %>%
dplyr::select_if(~ !any(is.na(.))) %>%
#needs to be a dataframe or EFS freaks out.
as.data.frame()
### Combining all data ###
all= clean_clinical_wide
# Getting into a format where we can do the clustering ##
clust_data = all[,3:283] %>%
#Remove columns that only contain 0s
select_if(colSums(.) != 0) %>%
as.data.frame()
annotation = all[,2] %>%
as.data.frame()
## Fixing the row names so they match for the heatmap ##
rownames(annotation) = all$sample_id
rownames(clust_data) = all$sample_id
#calculating distance matrix
d = dist(clust_data)
## Generating a dendrogram without a heatmap ##
library(dendextend)
dend = as.dendrogram(hclust(d, method = "ward.D2"))
# let's add some color: Red = dead, black = okay
colors_to_use = c("#000000","#FF0000")[as.numeric(as.factor(all$death_during_admission))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
# Plotting for manuscript
pdf("02_figures/f6_prediction_of_clinical_outcome_proteomics/f6a.pdf",height = 8,
width = 12)
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)
dev.off()## png
## 2
#Plotting for this document
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)# Checking stats to see if there are significant differences between clusters when k=2
xtab <- as.table(rbind(c(4, 8), c(13, 53)))
dimnames(xtab) <- list(
group = c("k1", "k2"),
death = c("yes", "no")
)
fisher_test(xtab)### Need to rerun msstats, this time changing the condition to be death upon admission ###
# read in clincal metadata
cmd = read_csv("00_r_input_data/clinical_metadata.csv") %>%
select(BioReplicate = sample_id,death_during_admission)
# Read MSstats.csv file.
data = read_csv("00_r_input_data/msstats.csv")
#Note that the condition of the bridge channel needs to be labeled as Norm
annotation = read_csv("00_r_input_data/MSstatsTMT_annotation.csv") %>%
inner_join(cmd,by = "BioReplicate") %>%
select(-Condition) %>%
rename(Condition = death_during_admission) %>%
mutate(Condition = as.factor(as.character(Condition))) %>%
mutate(Condition = case_when(is.na(Condition) ~ "unknown",
TRUE ~ Condition))
mstats = MSstatsTMT::PhilosophertoMSstatsTMTFormat(input = data,annotation)
# use MSstats for protein summarization
quant.msstats = MSstatsTMT::proteinSummarization(mstats,
method="msstats",
global_norm=TRUE,
reference_norm=TRUE,
remove_norm_channel = TRUE,
remove_empty_channel = TRUE)
# Taking the pl_data and formatting appropriately
pl_data = quant.msstats$ProteinLevelData %>%
tidyr::separate(Protein, c("orientation","uniprotid","locus"),sep ="\\|") %>%
# Converting to gene name
inner_join(protein_to_gene,by = c("uniprotid" = "ProteinID")) %>%
mutate(sample_id = paste0(Mixture,".",Channel))
write_csv(pl_data,"01_r_processed_data/quant_msstats_protein_level_data_death.csv")
### Preforming the statistics ###
test.pairwise = MSstatsTMT::groupComparisonTMT(quant.msstats,
moderated = TRUE)
stats =test.pairwise$ComparisonResult %>%
filter(Label == "FALSE vs TRUE") %>%
separate(Protein, c("orientation","uniprotid","locus"),sep ="\\|") %>%
inner_join(protein_to_gene,by = c("uniprotid" = "ProteinID"))
write_csv(stats,"01_r_processed_data/death_vs_life.csv")dvl = read_csv("01_r_processed_data/death_vs_life.csv") %>%
filter(is.na(issue))
f6b = dvl %>%
ggplot(aes(log2FC,-log10(adj.pvalue)))+
geom_point()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed",color = "red")+
xlab("Log2FC (Live/Dead)")
f6bggsave("02_figures/f6_prediction_of_clinical_outcome_proteomics/f6b.pdf",f6b,height = 6, width = 8)
# Extracting the signficiant proteins from the volcano plot for later use.
sig = dvl %>%
filter(adj.pvalue <= 0.05)
up_in_live = sig %>%
filter(log2FC>0) %>%
pull(Gene)
up_in_dead = sig %>%
filter(log2FC<0) %>%
pull(Gene)#Getting all the go annotations
go = read_csv("00_r_input_data/go_annotations.csv") %>%
dplyr::select(1,length(.)) %>%
tidyr::separate_rows(Gene.Ontology.IDs, sep = ';') %>%
dplyr::mutate(Gene.Ontology.IDs = gsub(" ","",Gene.Ontology.IDs)) %>%
# Going from Protein ID to gene ID
left_join(protein_to_gene,by = c("Entry" = "ProteinID")) %>%
select(-Entry) %>%
select(Entry = Gene,Gene.Ontology.IDs)
#usng enricher
TERM2GENE = go %>%
dplyr::select(TERM = Gene.Ontology.IDs,GENE = Entry) %>%
as.data.frame()
TERM2NAME = AnnotationDbi::select(GO.db::GO.db,keys = unique(TERM2GENE$TERM),columns = c("TERM")) %>%
as.data.frame()
# Perfomring GO enrichment
enrich = clusterProfiler::enricher(up_in_dead,
universe = go$Entry,
TERM2GENE = TERM2GENE,
TERM2NAME = TERM2NAME)
f6c = clusterProfiler::cnetplot(enrich, categorySize="pvalue",circular = FALSE,categorySize="pvalue",cex_label_category = 2.5,showCategory = 100)
f6c ggsave("02_figures/f6_prediction_of_clinical_outcome_proteomics/f6c.pdf",f6c)#Getting all the go annotations
go = read_csv("00_r_input_data/go_annotations.csv") %>%
dplyr::select(1,length(.)) %>%
tidyr::separate_rows(Gene.Ontology.IDs, sep = ';') %>%
dplyr::mutate(Gene.Ontology.IDs = gsub(" ","",Gene.Ontology.IDs)) %>%
# Going from Protein ID to gene ID
left_join(protein_to_gene,by = c("Entry" = "ProteinID")) %>%
select(-Entry) %>%
select(Entry = Gene,Gene.Ontology.IDs)
#usng enricher
TERM2GENE = go %>%
dplyr::select(TERM = Gene.Ontology.IDs,GENE = Entry) %>%
as.data.frame()
TERM2NAME = AnnotationDbi::select(GO.db::GO.db,keys = unique(TERM2GENE$TERM),columns = c("TERM")) %>%
as.data.frame()
# Perfomring GO enrichment
enrich = clusterProfiler::enricher(up_in_live,
universe = go$Entry,
TERM2GENE = TERM2GENE,
TERM2NAME = TERM2NAME)
f6d = clusterProfiler::cnetplot(enrich, categorySize="pvalue",
circular = FALSE,
categorySize="pvalue",
cex_label_category = 2.5,
showCategory = 100)
f6dggsave("02_figures/f6_prediction_of_clinical_outcome_proteomics/f6d.pdf",f6d)##reading in the data##
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv") %>%
rename(tmt_id = sample_id)
## Clinical metadata ##
map = readxl::read_excel("00_r_input_data/proteomics_sample_mapping.xlsx") %>%
mutate(tmt_id = paste0("plex",plex_number,".",TMT_channel)) %>%
select(tmt_id,sample_id = sample_name)
md = read_csv("00_r_input_data/clinical_metadata.csv")
md_final = inner_join(map,md,by = "sample_id")
clean_clinical = inner_join(pl,md_final, by = "tmt_id") %>%
# Don't need the healthy samples here
filter(Condition != "Healthy") %>%
# - messes up EFS for some reason, very annoying.
mutate(Gene = gsub("-","_",Gene)) %>%
select(sample_id,death_during_admission,Gene,Abundance) %>%
tidyr::pivot_wider(names_from = Gene,
values_from = Abundance) %>%
#Removing samples with NA.
filter(!is.na(death_during_admission)) %>%
dplyr::select_if(~ !any(is.na(.))) %>%
#needs to be a dataframe or EFS freaks out.
as.data.frame() %>%
select(-sample_id)
ensemble_result = EFS::ensemble_fs(clean_clinical,1,cor_threshold = 0)## [1] "default value for NA_threshold = 0.2"
## [1] "default value for runs = 100"
## [1] "default value for selection is c(TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, FALSE, FALSE)"
## [1] "Start Median"
## [1] "Start Pearson"
## [1] "Start Spearman"
## [1] "Start LogReg"
## [1] "Start RF"
## [1] 1
## Time difference of 0.2373149 secs
## [1] 2
## Time difference of 0.2398059 secs
## [1] 3
## Time difference of 0.2382245 secs
## [1] 4
## Time difference of 0.2468762 secs
## [1] 5
## Time difference of 0.2426667 secs
## [1] 6
## Time difference of 0.2395501 secs
## [1] 7
## Time difference of 0.2391062 secs
## [1] 8
## Time difference of 0.2375779 secs
## [1] 9
## Time difference of 0.2386758 secs
## [1] 10
## Time difference of 0.2379889 secs
## [1] 11
## Time difference of 0.237339 secs
## [1] 12
## Time difference of 0.2403865 secs
## [1] 13
## Time difference of 0.2379389 secs
## [1] 14
## Time difference of 0.2365415 secs
## [1] 15
## Time difference of 0.2383988 secs
## [1] 16
## Time difference of 0.2394145 secs
## [1] 17
## Time difference of 0.23757 secs
## [1] 18
## Time difference of 0.2378747 secs
## [1] 19
## Time difference of 0.2450583 secs
## [1] 20
## Time difference of 0.239738 secs
## [1] 21
## Time difference of 0.239192 secs
## [1] 22
## Time difference of 0.2388492 secs
## [1] 23
## Time difference of 0.2383192 secs
## [1] 24
## Time difference of 0.2390654 secs
## [1] 25
## Time difference of 0.2384799 secs
## [1] 26
## Time difference of 0.2367435 secs
## [1] 27
## Time difference of 0.2387636 secs
## [1] 28
## Time difference of 0.2382054 secs
## [1] 29
## Time difference of 0.2367411 secs
## [1] 30
## Time difference of 0.2386441 secs
## [1] 31
## Time difference of 0.2385774 secs
## [1] 32
## Time difference of 0.2390773 secs
## [1] 33
## Time difference of 0.2443857 secs
## [1] 34
## Time difference of 0.238708 secs
## [1] 35
## Time difference of 0.2388959 secs
## [1] 36
## Time difference of 0.2380121 secs
## [1] 37
## Time difference of 0.2382066 secs
## [1] 38
## Time difference of 0.237112 secs
## [1] 39
## Time difference of 0.2383742 secs
## [1] 40
## Time difference of 0.238616 secs
## [1] 41
## Time difference of 0.2385745 secs
## [1] 42
## Time difference of 0.2367756 secs
## [1] 43
## Time difference of 0.2388709 secs
## [1] 44
## Time difference of 0.2382045 secs
## [1] 45
## Time difference of 0.2373645 secs
## [1] 46
## Time difference of 0.2375965 secs
## [1] 47
## Time difference of 0.2386129 secs
## [1] 48
## Time difference of 0.2709024 secs
## [1] 49
## Time difference of 0.2401197 secs
## [1] 50
## Time difference of 0.2389624 secs
## [1] 51
## Time difference of 0.238574 secs
## [1] 52
## Time difference of 0.2383761 secs
## [1] 53
## Time difference of 0.2386229 secs
## [1] 54
## Time difference of 0.2381034 secs
## [1] 55
## Time difference of 0.2365055 secs
## [1] 56
## Time difference of 0.2397709 secs
## [1] 57
## Time difference of 0.2364321 secs
## [1] 58
## Time difference of 0.2395811 secs
## [1] 59
## Time difference of 0.2390161 secs
## [1] 60
## Time difference of 0.238034 secs
## [1] 61
## Time difference of 0.239666 secs
## [1] 62
## Time difference of 0.2389221 secs
## [1] 63
## Time difference of 0.2469056 secs
## [1] 64
## Time difference of 0.2400572 secs
## [1] 65
## Time difference of 0.2369964 secs
## [1] 66
## Time difference of 0.2402818 secs
## [1] 67
## Time difference of 0.2382226 secs
## [1] 68
## Time difference of 0.2379162 secs
## [1] 69
## Time difference of 0.2391052 secs
## [1] 70
## Time difference of 0.2384758 secs
## [1] 71
## Time difference of 0.2383642 secs
## [1] 72
## Time difference of 0.2394919 secs
## [1] 73
## Time difference of 0.2390232 secs
## [1] 74
## Time difference of 0.2386019 secs
## [1] 75
## Time difference of 0.2394998 secs
## [1] 76
## Time difference of 0.238121 secs
## [1] 77
## Time difference of 0.2389491 secs
## [1] 78
## Time difference of 0.2379348 secs
## [1] 79
## Time difference of 0.2485383 secs
## [1] 80
## Time difference of 0.2386885 secs
## [1] 81
## Time difference of 0.2384729 secs
## [1] 82
## Time difference of 0.2383795 secs
## [1] 83
## Time difference of 0.2386177 secs
## [1] 84
## Time difference of 0.2383282 secs
## [1] 85
## Time difference of 0.2377071 secs
## [1] 86
## Time difference of 0.2386155 secs
## [1] 87
## Time difference of 0.2391093 secs
## [1] 88
## Time difference of 0.2384124 secs
## [1] 89
## Time difference of 0.2370389 secs
## [1] 90
## Time difference of 0.2391346 secs
## [1] 91
## Time difference of 0.238497 secs
## [1] 92
## Time difference of 0.238467 secs
## [1] 93
## Time difference of 0.237864 secs
## [1] 94
## Time difference of 0.2490449 secs
## [1] 95
## Time difference of 0.2408738 secs
## [1] 96
## Time difference of 0.2385709 secs
## [1] 97
## Time difference of 0.2380588 secs
## [1] 98
## Time difference of 0.238225 secs
## [1] 99
## Time difference of 0.2399116 secs
## [1] 100
## Time difference of 0.2397165 secs
## [1] "Build return matrix"
## [1] "Done"
## Time difference of 24.07907 secs
long = ensemble_result %>%
as.data.frame() %>%
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.),names_to = "Gene") %>%
#converting genes back to proper notation.
mutate(Gene = gsub("_","-",Gene))
top = long %>%
group_by(Gene) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum)
topn = top[1:10,] %>%
pull(Gene)
topn_out = tibble(Gene = topn)
write_csv(topn_out,"01_r_processed_data/top10_live_v_dead_proteomics_biomarkers.csv")
filtered = long %>%
filter(Gene %in% topn)
EFS = filtered %>%
ggplot(aes(reorder(Gene,value),value,fill = model))+
geom_col()+
coord_flip()+
viridis::scale_fill_viridis(discrete = TRUE)+
ggtitle("Top 10 Performing Protein Biomarkers")+
xlab("Protein Biomarkers")
EFSsource("functions.R")
data = inner_join(pl,md_final, by = "tmt_id") %>%
filter(Condition != "Healthy") %>%
mutate(Condition = death_during_admission) %>%
filter(!is.na(Condition))
f6e = make_roc_proteomics(data,topn)ggsave("02_figures/f6_prediction_of_clinical_outcome_proteomics/f6e.pdf",f6e,height = 6, width = 16)## Metabolomics data
## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data//normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Switching to the wide format ###
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value) %>%
filter(!is.na(death_during_admission))
# Getting into a format where we can do the clustering ##
clust_data = norm_wide[,78:length(norm_wide)] %>%
#Remove columns that only contain 0s
select_if(colSums(.) != 0) %>%
as.data.frame()
## Fixing the row names so they match for the heatmap ##
rownames(clust_data) = norm_wide$sample_id
annotation = norm_wide$death_during_admission %>%
as.data.frame()
## Generating a dendrogram without a heatmap ##
library(dendextend)
d = dist(clust_data)
dend = as.dendrogram(hclust(d, method = "ward.D2"))
# let's add some color: Red = dead, black = okay
colors_to_use = c("#000000","#FF0000")[as.numeric(as.factor(norm_wide$death_during_admission))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)# Plotting for manuscript
pdf("02_figures/f7_prediction_of_clinical_outcome_metabolomics/f7a.pdf",height = 8,width = 12)
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)
dev.off()## png
## 2
xtab <- as.table(rbind(c(4, 8), c(13, 53)))
dimnames(xtab) <- list(
group = c("k1", "k2"),
death = c("yes", "no")
)
fisher_test(xtab)## Metabolomics data
## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data//normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Switching to the wide format ###
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value) %>%
filter(!is.na(death_during_admission))
## Looking for only columns that were annotated ##
gnps_and_in_norm = intersect(unique(gnps_annotations$row_id), colnames(norm_wide))
gnps_annotated_metabolites = norm_wide %>%
as.data.frame() %>%
select(sample_id,death_during_admission,gnps_and_in_norm)
# Getting into a format where we can do the clustering ##
clust_data = gnps_annotated_metabolites[,3:213] %>%
#Remove columns that only contain 0s
select_if(colSums(.) != 0) %>%
as.data.frame()
## Fixing the row names so they match for the heatmap ##
rownames(clust_data) = norm_wide$sample_id
annotation = norm_wide$death_during_admission %>%
as.data.frame()
## Generating a dendrogram without a heatmap ##
library(dendextend)
d = dist(clust_data)
dend = as.dendrogram(hclust(d, method = "ward.D2"))
# let's add some color: Red = dead, black = okay
colors_to_use = c("#000000","#FF0000")[as.numeric(as.factor(norm_wide$death_during_admission))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)# Plotting for manuscript
pdf("02_figures/f7_prediction_of_clinical_outcome_metabolomics/f7b.pdf",height = 8,width = 12)
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)
dev.off()## png
## 2
xtab <- as.table(rbind(c(4, 8), c(13, 53)))
dimnames(xtab) <- list(
group = c("k1", "k2"),
death = c("yes", "no")
)
fisher_test(xtab)## Reading in Data
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv") %>%
filter(!is.na(death_during_admission))
# Performing statistics so we can make a volcano plot
stat = normalized_data %>%
dplyr::group_by(row_id) %>%
rstatix::t_test(norm_value ~ death_during_admission) %>%
rstatix::adjust_pvalue(p.col = "p",output.col = "p.adj_fdr", method = "fdr")
# Calculating log2fc
log2fc_data = normalized_data %>%
dplyr::select(metabolomics_id,row_id,death_during_admission,norm_value)
# Extracting just the faecalis data
dead = log2fc_data %>%
filter(death_during_admission == TRUE) %>%
group_by(row_id) %>%
summarise(dead = mean(norm_value))
# Extracting just the faecium data
live = log2fc_data %>%
filter(death_during_admission == FALSE) %>%
group_by(row_id) %>%
summarise(live = mean(norm_value))
# Calculating log2fc
log2fc = inner_join(dead,live,by = "row_id") %>%
mutate(log2fc_live_dead = log2(live/dead))
# Combining log2fc and statistics
vp = inner_join(stat,log2fc,by = "row_id") %>%
mutate(annotated = case_when(row_id %in% gnps_annotations$row_id ~ TRUE,
TRUE ~ FALSE))
annotations = vp %>%
filter(p.adj_fdr <= 0.05,
annotated == TRUE) %>%
inner_join(gnps_annotations, by = "row_id")
f7c = vp %>%
ggplot(aes(log2fc_live_dead,-log10(p.adj_fdr),color = annotated))+
geom_point()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed",color = "red")+
xlab("log2FC (Live/Dead)")+
theme(legend.position = "bottom")+
ggrepel::geom_label_repel(data = annotations,aes(label = compound_name_cleaned,x= log2fc_live_dead, y = -log10(p.adj_fdr)))
f7cggsave(filename = "02_figures/f7_prediction_of_clinical_outcome_metabolomics/f7c.pdf",f7c,height = 5, width = 8)## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Switching to the wide format ###
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value)
## Looking for only columns that were annotated ##
gnps_and_in_norm = intersect(unique(gnps_annotations$row_id), colnames(norm_wide))
gnps_annotated_metabolites = norm_wide %>%
as.data.frame() %>%
select(death_during_admission,gnps_and_in_norm) %>%
filter(!is.na(death_during_admission))
# Seting seed incase this isn't done internally in EFS
set.seed(2)
## Running EFS feature selection ##
ensemble_result = EFS::ensemble_fs(gnps_annotated_metabolites,
classnumber = 1,cor_threshold = 0)## [1] "default value for NA_threshold = 0.2"
## [1] "default value for runs = 100"
## [1] "default value for selection is c(TRUE, TRUE, TRUE,TRUE, TRUE, TRUE, FALSE, FALSE)"
## [1] "Start Median"
## [1] "Start Pearson"
## [1] "Start Spearman"
## [1] "Start LogReg"
## [1] "Start RF"
## [1] 1
## Time difference of 0.1942012 secs
## [1] 2
## Time difference of 0.2087853 secs
## [1] 3
## Time difference of 0.1944389 secs
## [1] 4
## Time difference of 0.1950138 secs
## [1] 5
## Time difference of 0.1938634 secs
## [1] 6
## Time difference of 0.1944354 secs
## [1] 7
## Time difference of 0.1947858 secs
## [1] 8
## Time difference of 0.1939828 secs
## [1] 9
## Time difference of 0.1930513 secs
## [1] 10
## Time difference of 0.1936624 secs
## [1] 11
## Time difference of 0.1938617 secs
## [1] 12
## Time difference of 0.1945403 secs
## [1] 13
## Time difference of 0.1933548 secs
## [1] 14
## Time difference of 0.1938479 secs
## [1] 15
## Time difference of 0.1942153 secs
## [1] 16
## Time difference of 0.1934369 secs
## [1] 17
## Time difference of 0.1940291 secs
## [1] 18
## Time difference of 0.1935894 secs
## [1] 19
## Time difference of 0.1939664 secs
## [1] 20
## Time difference of 0.1940956 secs
## [1] 21
## Time difference of 0.1945598 secs
## [1] 22
## Time difference of 0.2015421 secs
## [1] 23
## Time difference of 0.1946578 secs
## [1] 24
## Time difference of 0.1943195 secs
## [1] 25
## Time difference of 0.1933215 secs
## [1] 26
## Time difference of 0.1943469 secs
## [1] 27
## Time difference of 0.1942339 secs
## [1] 28
## Time difference of 0.1947808 secs
## [1] 29
## Time difference of 0.1945148 secs
## [1] 30
## Time difference of 0.1937914 secs
## [1] 31
## Time difference of 0.1941602 secs
## [1] 32
## Time difference of 0.1933351 secs
## [1] 33
## Time difference of 0.1932025 secs
## [1] 34
## Time difference of 0.1930056 secs
## [1] 35
## Time difference of 0.1939437 secs
## [1] 36
## Time difference of 0.1934345 secs
## [1] 37
## Time difference of 0.1934783 secs
## [1] 38
## Time difference of 0.1935275 secs
## [1] 39
## Time difference of 0.1941283 secs
## [1] 40
## Time difference of 0.1938252 secs
## [1] 41
## Time difference of 0.1941159 secs
## [1] 42
## Time difference of 0.2017584 secs
## [1] 43
## Time difference of 0.1943264 secs
## [1] 44
## Time difference of 0.1936121 secs
## [1] 45
## Time difference of 0.1939304 secs
## [1] 46
## Time difference of 0.1936915 secs
## [1] 47
## Time difference of 0.1942821 secs
## [1] 48
## Time difference of 0.1937149 secs
## [1] 49
## Time difference of 0.1943805 secs
## [1] 50
## Time difference of 0.1931441 secs
## [1] 51
## Time difference of 0.1932924 secs
## [1] 52
## Time difference of 0.1936314 secs
## [1] 53
## Time difference of 0.1957285 secs
## [1] 54
## Time difference of 0.1948321 secs
## [1] 55
## Time difference of 0.1930025 secs
## [1] 56
## Time difference of 0.1939728 secs
## [1] 57
## Time difference of 0.1933589 secs
## [1] 58
## Time difference of 0.1927891 secs
## [1] 59
## Time difference of 0.1931746 secs
## [1] 60
## Time difference of 0.1926086 secs
## [1] 61
## Time difference of 0.2013631 secs
## [1] 62
## Time difference of 0.1948898 secs
## [1] 63
## Time difference of 0.1944175 secs
## [1] 64
## Time difference of 0.1943603 secs
## [1] 65
## Time difference of 0.1935189 secs
## [1] 66
## Time difference of 0.1945031 secs
## [1] 67
## Time difference of 0.1923978 secs
## [1] 68
## Time difference of 0.1942472 secs
## [1] 69
## Time difference of 0.1937041 secs
## [1] 70
## Time difference of 0.1928911 secs
## [1] 71
## Time difference of 0.1940265 secs
## [1] 72
## Time difference of 0.1946826 secs
## [1] 73
## Time difference of 0.1940122 secs
## [1] 74
## Time difference of 0.1930516 secs
## [1] 75
## Time difference of 0.194885 secs
## [1] 76
## Time difference of 0.1948833 secs
## [1] 77
## Time difference of 0.1938276 secs
## [1] 78
## Time difference of 0.1946733 secs
## [1] 79
## Time difference of 0.1936193 secs
## [1] 80
## Time difference of 0.193295 secs
## [1] 81
## Time difference of 0.2034726 secs
## [1] 82
## Time difference of 0.1935394 secs
## [1] 83
## Time difference of 0.1946504 secs
## [1] 84
## Time difference of 0.1941459 secs
## [1] 85
## Time difference of 0.1939492 secs
## [1] 86
## Time difference of 0.1946206 secs
## [1] 87
## Time difference of 0.1946251 secs
## [1] 88
## Time difference of 0.1939914 secs
## [1] 89
## Time difference of 0.1941123 secs
## [1] 90
## Time difference of 0.1950834 secs
## [1] 91
## Time difference of 0.1945875 secs
## [1] 92
## Time difference of 0.1940007 secs
## [1] 93
## Time difference of 0.1941025 secs
## [1] 94
## Time difference of 0.1952741 secs
## [1] 95
## Time difference of 0.1939723 secs
## [1] 96
## Time difference of 0.193053 secs
## [1] 97
## Time difference of 0.1943495 secs
## [1] 98
## Time difference of 0.1947656 secs
## [1] 99
## Time difference of 0.193516 secs
## [1] 100
## Time difference of 0.2017739 secs
## [1] "Build return matrix"
## [1] "Done"
## Time difference of 19.56191 secs
## Formating the data as long ##
long = ensemble_result %>%
as.data.frame() %>%
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.),names_to = "row_id")
## Extracting top metabolites ##
top = long %>%
group_by(row_id) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum)
## Pulling top N metabolites ##
topn = top[1:10,] %>%
pull(row_id)
topn_out = tibble(row_id = topn )
write_csv(topn_out,"01_r_processed_data/top10_live_v_dead_metabolomics_biomarkers.csv")
data = normalized_data %>%
inner_join(gnps_annotations,by = "row_id") %>%
filter(!is.na(death_during_admission)) %>%
filter(row_id %in% topn) %>%
mutate(condition = as.character(death_during_admission))
f7d = make_roc_metabolomics(data,ncol = 5)ggsave("02_figures/f7_prediction_of_clinical_outcome_metabolomics/f7d.pdf",f7d,height = 12,width = 32)This is generated in draw.io
Need to get info from Dominic before I can do anything with this.
# Reading in data
protein_sum = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
topn = read_csv("01_r_processed_data/top10_infected_v_healthy_proteomics_biomarkers.csv") %>% pull(Gene)
# Processing data
ROC_data = dplyr::filter(protein_sum,Gene %in% topn) #%>%
#dplyr::mutate(Condition = case_when(Condition %in% c("Faecalis","Faecium") ~ "infected",
# TRUE ~ Condition))
source("functions.R")
s3a = make_violin_plot_proteomics(ROC_data,topn)s3a## TableGrob (2 x 5) "arrange": 10 grobs
## z cells name grob
## SERPINA3 1 (1-1,1-1) arrange gtable[layout]
## LRG1 2 (1-1,2-2) arrange gtable[layout]
## CLEC3B 3 (1-1,3-3) arrange gtable[layout]
## TF 4 (1-1,4-4) arrange gtable[layout]
## APOA2 5 (1-1,5-5) arrange gtable[layout]
## LBP 6 (2-2,1-1) arrange gtable[layout]
## ENO1 7 (2-2,2-2) arrange gtable[layout]
## GSN 8 (2-2,3-3) arrange gtable[layout]
## SERPINA5 9 (2-2,4-4) arrange gtable[layout]
## FGA 10 (2-2,5-5) arrange gtable[layout]
ggsave("03_supplemental/s3_biomarkers_violin/s3a.pdf",s3a, height = 6 ,width=16)## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
topn = read_csv("01_r_processed_data/top10_infected_v_healthy_metabolomics_biomarkers.csv") %>% pull(row_id)
ROC_data = dplyr::filter(normalized_data,row_id %in% topn) %>%
dplyr::arrange(factor(row_id,topn)) %>%
inner_join(gnps_annotations,by = c("row_id" = "row_id"))
source("functions.R")
s3b = make_violin_plot_metabolomics(ROC_data,topn)ggsave("03_supplemental/s3_biomarkers_violin/s3b.pdf",s3b,height = 6 ,width=16)# Reading in data
protein_sum = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
topn = read_csv("01_r_processed_data/top10_faecalis_v_faecium_proteomics_biomarkers.csv") %>% pull(Gene)
# Processing data
ROC_data = dplyr::filter(protein_sum,Gene %in% topn)
source("functions.R")
s3c = make_violin_plot_proteomics(ROC_data,topn)s3c## TableGrob (2 x 5) "arrange": 10 grobs
## z cells name grob
## IGKV2-30 1 (1-1,1-1) arrange gtable[layout]
## PCYOX1 2 (1-1,2-2) arrange gtable[layout]
## SERPINC1 3 (1-1,3-3) arrange gtable[layout]
## APOC3 4 (1-1,4-4) arrange gtable[layout]
## PROC 5 (1-1,5-5) arrange gtable[layout]
## IGHV6-1 6 (2-2,1-1) arrange gtable[layout]
## RBP4 7 (2-2,2-2) arrange gtable[layout]
## IGHV3-7 8 (2-2,3-3) arrange gtable[layout]
## IGLV1-47 9 (2-2,4-4) arrange gtable[layout]
## AZGP1 10 (2-2,5-5) arrange gtable[layout]
ggsave("03_supplemental/s3_biomarkers_violin/s3c.pdf",s3c,height = 6 ,width=16)## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
topn = read_csv("01_r_processed_data/top10_faecalis_v_faecium_metabolomics_biomarkers.csv") %>% pull(row_id)
ROC_data = dplyr::filter(normalized_data,row_id %in% topn) %>%
dplyr::arrange(factor(row_id,topn)) %>%
inner_join(gnps_annotations,by = c("row_id" = "row_id"))
s3d = make_violin_plot_metabolomics(ROC_data,topn)ggsave("03_supplemental/s3_biomarkers_violin/s3d.pdf",s3d,height = 6 ,width=16)clinical_md = read_csv("00_r_input_data/clinical_metadata.csv")
protein_sum = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
# Defining the input metadata. Need to chose which columns to keep here.
md_assesment_data= protein_sum %>%
filter(Condition != "Healthy") %>%
inner_join(clinical_md, by = c("BioReplicate" = "sample_id")) %>%
select(sample_id,Gene,bmi,bacteremia_duration,charleson_index,congestive_heart_failure,
death_during_admission,history_of_myocardial_infarction,transplant_type,pathogen,icu_admission,
hypotension,gender,race,smoking,polymicrobial_bacteremia,neutropenia_anc_1500,leukemia_or_lymphoma,Abundance) %>%
ungroup()
topn = read_csv("01_r_processed_data/top10_faecalis_v_faecium_proteomics_biomarkers.csv") %>%
pull(Gene)
source("functions.R")
stat = metadata_assesment(md_assesment_data,
metric = Gene,
sample_id = sample_id,
features = topn,
outcome = Abundance,
outcome_c = "Abundance")
stat_final = rstatix::adjust_pvalue(stat,"p",output.col = "p.adj",method = "fdr")
## Plotting ###
s4a = stat_final %>%
ggplot(aes(reorder(variable,p.adj),-log10(p.adj)))+
geom_col()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed", color = "red")+
coord_flip()+
facet_wrap(~Gene,nrow = 2) +
xlab("metadata_field")
s4aggsave("03_supplemental/s4_biomarkers_clinical_metadata_assessment/s4a.pdf",s4a,height = 8, width = 16)metabolomics_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
m = inner_join(metabolomics_data,gnps_annotations,by = "row_id")
# Defining the input metadata. Need to chose which columns to keep here.
md_assesment_data= m %>%
filter(condition != "healthy") %>%
select(sample_id,row_id,bmi,bacteremia_duration,charleson_index,congestive_heart_failure,
death_during_admission,history_of_myocardial_infarction,transplant_type,pathogen,icu_admission,
hypotension,gender,race,smoking,polymicrobial_bacteremia,neutropenia_anc_1500,leukemia_or_lymphoma,norm_value) %>%
ungroup()
topn = read_csv("01_r_processed_data/top10_faecalis_v_faecium_metabolomics_biomarkers.csv") %>%
pull(row_id)
source("functions.R")
stat = metadata_assesment(md_assesment_data,
metric = row_id,
sample_id = sample_id,
features = topn,
outcome = norm_value,
outcome_c = "norm_value")
stat_final = rstatix::adjust_pvalue(stat,"p",output.col = "p.adj",method = "fdr")
## Plotting ###
s4b = stat_final %>%
inner_join(gnps_annotations,by = "row_id") %>%
ggplot(aes(reorder(variable,p.adj),-log10(p.adj)))+
geom_col()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed", color = "red")+
coord_flip()+
facet_wrap(~compound_name_cleaned,nrow = 2) +
xlab("metadata_field")
s4bggsave("03_supplemental/s4_biomarkers_clinical_metadata_assessment/s4b.pdf",s4b,height = 8, width = 16)clinical_md = read_csv("00_r_input_data/clinical_metadata.csv")
protein_sum = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
# Defining the input metadata. Need to chose which columns to keep here.
md_assesment_data= protein_sum %>%
inner_join(clinical_md, by = c("BioReplicate" = "sample_id")) %>%
filter(!is.na(death_during_admission)) %>%
select(sample_id,Gene,bmi,bacteremia_duration,charleson_index,congestive_heart_failure,
death_during_admission,history_of_myocardial_infarction,transplant_type,pathogen,icu_admission,
hypotension,gender,race,smoking,polymicrobial_bacteremia,neutropenia_anc_1500,leukemia_or_lymphoma,Abundance) %>%
ungroup()
topn = read_csv("01_r_processed_data/top10_live_v_dead_proteomics_biomarkers.csv") %>%
pull(Gene)
source("functions.R")
stat = metadata_assesment(md_assesment_data,
metric = Gene,
sample_id = sample_id,
features = topn,
outcome = Abundance,
outcome_c = "Abundance")
stat_final = rstatix::adjust_pvalue(stat,"p",output.col = "p.adj",method = "fdr")
## Plotting ###
s4c = stat_final %>%
ggplot(aes(reorder(variable,p.adj),-log10(p.adj)))+
geom_col()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed", color = "red")+
coord_flip()+
facet_wrap(~Gene,nrow = 2) +
xlab("metadata_field")
s4cggsave("03_supplemental/s4_biomarkers_clinical_metadata_assessment/s4c.pdf",s4a,height = 8, width = 16)metabolomics_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
m = inner_join(metabolomics_data,gnps_annotations,by = "row_id")
# Defining the input metadata. Need to chose which columns to keep here.
md_assesment_data= m %>%
filter(!is.na(death_during_admission)) %>%
select(sample_id,row_id,bmi,bacteremia_duration,charleson_index,congestive_heart_failure,
death_during_admission,history_of_myocardial_infarction,transplant_type,pathogen,icu_admission,
hypotension,gender,race,smoking,polymicrobial_bacteremia,neutropenia_anc_1500,leukemia_or_lymphoma,norm_value) %>%
ungroup()
topn = read_csv("01_r_processed_data/top10_live_v_dead_metabolomics_biomarkers.csv") %>%
pull(row_id)
source("functions.R")
stat = metadata_assesment(md_assesment_data,
metric = row_id,
sample_id = sample_id,
features = topn,
outcome = norm_value,
outcome_c = "norm_value")
stat_final = rstatix::adjust_pvalue(stat,"p",output.col = "p.adj",method = "fdr")
## Plotting ###
s4d = stat_final %>%
inner_join(gnps_annotations,by = "row_id") %>%
ggplot(aes(reorder(variable,p.adj),-log10(p.adj)))+
geom_col()+
geom_hline(yintercept = -log10(0.05),linetype = "dashed", color = "red")+
coord_flip()+
facet_wrap(~compound_name_cleaned,nrow = 2) +
xlab("metadata_field")
s4dggsave("03_supplemental/s4_biomarkers_clinical_metadata_assessment/s4b.pdf",s4b,height = 8, width = 16)protein_sum = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
anot = read_csv("00_r_input_data/all_protein_annotations.csv")
sum_annotated = left_join(protein_sum,anot,by = c("uniprotid" = "Entry"))
ig = sum_annotated %>%
filter(grepl(".*Immunoglobulin.*",Protein.names)) %>%
filter(!grepl(".*receptor.*",Protein.names))
stat = ig %>%
rstatix::t_test(Abundance ~ Condition)
s5 = ig %>%
ggplot(aes(Condition,Abundance))+
geom_violin()+
geom_boxplot()+
ggpubr::stat_pvalue_manual(stat,
y.position = 22,
step.increase = 0.1)
ggsave("03_supplemental/s5_immunoglobulin_level/s5.pdf",s5)Network-based Cytokine Inference Knowledge-based networks were used to infer the relative contributions of major cytokines on the observed proteomic alterations. First, a list of cytokines was manually curated (TGFb, TNF, IFN, IL1-40, CXCL1-16, CCL1-27) and submitted with the significantly altered proteins (uncorrected ANOVA p < 0.05) to the STRING-db tool (all active interaction sources, interaction score > 0.4). The network was exported into a simple tabular output and cytokines were profiled for their enrichment in mortality clusters (pMortality+, pMortality and pMortality) using the following approach. First, cytokines were filtered to have at least five connections to mortality networks. Then, the proportion of each cytokine within the mortality clusters (ie. number of cytokine-mortality cluster connections relative to the total connections within mortality clusters) was compared to the proportion of each cytokine in the entire dataset (ie. number of cytokine-protein connections relative to total connections within the entire dataset). Significance was determined using Chi-square tests and a fold-change was calculated by dividing the proportion of cytokine-mortality connections by the proportion of cytokine connections within the total dataset. A final enrichment score was calculated by multiplying the -log10(p value) from the Chisquare test by the log2(fold-change of enrichment). By including the total number of connections that each cytokine had in the entire dataset as a background, any enrichment bias due to higher annotation rates for popular cytokines in STRING-db is controlled for.
### Loading data ###
faecalis_v_faecium = readr::read_csv("01_r_processed_data/faecalis_v_faecium.csv")
# Name Mapping
annotations = read_csv("00_r_input_data/gene_uniprot_annotations.csv")
#changing the names to uniprotid
faecalis_v_faecium = left_join(faecalis_v_faecium,annotations,by = c("uniprotid"))
# reading in the curated cytokine list
cytokine_list = read_csv("00_r_input_data/jacobs_cytokine_list.csv",skip = 1)
### Cytokine Network generated for all proteins detected in the study ###
background = faecalis_v_faecium %>%
pull(gene_name)
#exporting file that we can use with the stringdb web tool
#https://www.string-db.org/cgi/input?sessionId=bjuiDWoZnTQb&input_page_show_search=on
all_proteins_and_cytokines = c(background,cytokine_list$cytokines)
write_lines(all_proteins_and_cytokines,"01_r_processed_data/cytokine_inference/00_all_proteins_and_cytokines.csv")
# Results were exported from the stringdb web tool.
# Settings were set as follows: minimum required interaction score = 0.400 (medium), active interaction sources = all (textmining, neighborhood, experiments, gene fusions, databases, co-occurenece, co-expression)
# Gene names that did not find a map were disregarded.
# Was exported as tabular text output with reciprocal edges
# Reading in results input into string db using faecalis_out proteins
string_db_results = read_tsv("01_r_processed_data/cytokine_inference/01_all_proteins_and_cytokines_string_db_out.tsv") %>%
#only keeping connections that are between a protein identified and cytokines
filter(`#node1` %in% background & `node2` %in% cytokine_list$cytokines)
write_csv(string_db_results,"01_r_processed_data/cytokine_inference/02_all_proteins_and_cytokines_string_db_out_filtered.csv")
# counting the number of times a cytokine interacted with a detected protein
n_background_proteins = length(background)
background_cytokines = string_db_results %>%
group_by(node2) %>%
summarise(n = n()) %>%
mutate(condition = "background_present")
write_lines(cytokine_list$cytokines,"01_r_processed_data/cytokine_inference/cytokine_node_list")This figure is generated in cytoscape using 02_all_proteins_and_cytokines_string_db_out_filtered.csv as the input network. Biopax style was used. Labels were removed. Background was changed to white and the edges were changed to black with a transparency of 75. Nodes in node 2 were selected using the cytokine_node_list file and their size (40), color (dark red) and the shape (diamond) was changed to distinguish the cytokines from other nodes.
4 cytoscape diagrams were created using the initial plot as a template in order to highlight the protein nodes by the protein belonging to each condition. Keeping with the scheme of the paper, healthy samples were colored yellow, faecalis infected samples were colored navy, and faecium colored samples were colored turquoise.
This figure was created in biorender.
## Doing this for Infected vs Background ##
### Now lets try doing this in Infected vs Healthy ###
infected_vs_healthy = read_csv("01_r_processed_data/Infected_vs_Healthy.csv")
# Name Mapping
annotations = read_csv("00_r_input_data/gene_uniprot_annotations.csv")
#changing the names to uniprotid
infected_vs_healthy = left_join(infected_vs_healthy,annotations,by = c("uniprotid"))
# figuring otu what genes were up in infected
up_in_infected = infected_vs_healthy %>%
filter(log2FC>0) %>%
filter(adj.pvalue <= 0.05) %>%
arrange(adj.pvalue) %>%
pull(gene_name)
n_infected_proteins = length(up_in_infected)
infected_cytokines = string_db_results %>%
filter(`#node1` %in% up_in_infected) %>%
group_by(node2) %>%
summarise(n= n()) %>%
mutate(condition = "infected_present")
background_infected = bind_rows(background_cytokines,infected_cytokines) %>%
pivot_wider(names_from = condition, values_from = n) %>%
mutate(infected_present = replace_na(infected_present,0),
background_absent = n_background_proteins- background_present,
infected_absent = n_infected_proteins - infected_present)
# Making a contingency table
out = data.frame()
for(i in unique(background_infected$node2)){
contingency_table_present = background_infected %>%
filter(node2 == i) %>%
select(infected_present,background_present) %>%
t() %>%
as.data.frame() %>%
rename(present = V1)
contingency_table_absent =background_infected %>%
filter(node2 == i) %>%
select(infected_absent,background_absent) %>%
t() %>%
as.data.frame() %>%
rename(absent = V1)
contingency_table = bind_cols(contingency_table_present,contingency_table_absent) %>%
as.matrix()
stat = rstatix::fisher_test(contingency_table) %>%
mutate(cytokine = i )
out = rbind(stat,out)
}
corrected_pvalues = out %>%
rstatix::adjust_pvalue(method = "fdr")
# Now calculating fold change
log2fc = background_infected %>%
select(node2,
background_present,
infected_present) %>%
mutate(n_background_proteins = n_background_proteins,
n_infected_proteins = n_infected_proteins,
proportion_infected = infected_present/n_infected_proteins,
proportion_background = background_present/n_background_proteins,
log2fc = log2(proportion_infected/proportion_background)) %>%
select(node2,log2fc,proportion_in_condition = proportion_infected,
proportion_in_background = proportion_background)
# all metrics
infected_metrics = corrected_pvalues %>%
inner_join(log2fc,by = c("cytokine" ="node2")) %>%
mutate(enrichment_score = -log10(p.adj) * log2fc) %>%
mutate(condition = "infected")
write_lines(up_in_infected,"01_r_processed_data/cytokine_inference/infected_protein_list")### Now lets try doing this in Infected vs Healthy ###
infected_vs_healthy = read_csv("01_r_processed_data/Infected_vs_Healthy.csv")
# Name Mapping
annotations = read_csv("00_r_input_data/gene_uniprot_annotations.csv")
#changing the names to uniprotid
infected_vs_healthy = left_join(infected_vs_healthy,annotations,by = c("uniprotid"))
# up in healthy
up_in_healthy = infected_vs_healthy %>%
filter(log2FC<0) %>%
filter(adj.pvalue <= 0.05) %>%
arrange(adj.pvalue) %>%
pull(gene_name)
n_healthy_proteins = length(up_in_healthy)
healthy_cytokines = string_db_results %>%
filter(`#node1` %in% up_in_healthy) %>%
group_by(node2) %>%
summarise(n= n()) %>%
mutate(condition = "healthy_present")
background_healthy = bind_rows(background_cytokines,healthy_cytokines) %>%
pivot_wider(names_from = condition, values_from = n) %>%
mutate(healthy_present = replace_na(healthy_present,0),
background_absent = n_background_proteins- background_present,
healthy_absent = n_healthy_proteins - healthy_present)
# Making a contingency table
out = data.frame()
for(i in unique(background_healthy$node2)){
contingency_table_present = background_healthy %>%
filter(node2 == i) %>%
select(healthy_present,background_present) %>%
t() %>%
as.data.frame() %>%
rename(present = V1)
contingency_table_absent =background_healthy %>%
filter(node2 == i) %>%
select(healthy_absent,background_absent) %>%
t() %>%
as.data.frame() %>%
rename(absent = V1)
contingency_table = bind_cols(contingency_table_present,contingency_table_absent) %>%
as.matrix()
stat = rstatix::fisher_test(contingency_table) %>%
mutate(cytokine = i )
out = rbind(stat,out)
}
corrected_pvalues = out %>%
rstatix::adjust_pvalue(method = "fdr")
# Now calculating fold change
log2fc = background_healthy %>%
select(node2,
background_present,
healthy_present) %>%
mutate(n_background_proteins = n_background_proteins,
n_healthy_proteins = n_healthy_proteins,
proportion_healthy = healthy_present/n_healthy_proteins,
proportion_background = background_present/n_background_proteins,
log2fc = log2(proportion_healthy/proportion_background)) %>%
select(node2,log2fc,proportion_in_condition = proportion_healthy,
proportion_in_background = proportion_background)
# all metrics
healthy_metrics = corrected_pvalues %>%
inner_join(log2fc,by = c("cytokine" ="node2")) %>%
mutate(enrichment_score = -log10(p.adj) * log2fc,
condition = "healthy")
write_lines(up_in_healthy,"01_r_processed_data/cytokine_inference/healthy_protein_list")### Now lets try doing this in Facalis vs Faecium ###
faecalis_v_faecium = readr::read_csv("01_r_processed_data/faecalis_v_faecium.csv")
# Name Mapping
annotations = read_csv("00_r_input_data/gene_uniprot_annotations.csv")
#changing the names to uniprotid
faecalis_v_faecium = left_join(faecalis_v_faecium,annotations,by = c("uniprotid"))
# up in faecalis
up_in_faecalis = faecalis_v_faecium %>%
filter(log2FC>0) %>%
filter(adj.pvalue <= 0.05) %>%
arrange(adj.pvalue) %>%
pull(gene_name)
n_faecalis_proteins = length(up_in_faecalis)
faecalis_cytokines = string_db_results %>%
filter(`#node1` %in% up_in_faecalis) %>%
group_by(node2) %>%
summarise(n= n()) %>%
mutate(condition = "faecalis_present")
background_faecalis = bind_rows(background_cytokines,faecalis_cytokines) %>%
pivot_wider(names_from = condition, values_from = n) %>%
mutate(faecalis_present = replace_na(faecalis_present,0),
background_absent = n_background_proteins- background_present,
faecalis_absent = n_faecalis_proteins - faecalis_present)
# Making a contingency table
out = data.frame()
for(i in unique(background_faecalis$node2)){
contingency_table_present = background_faecalis %>%
filter(node2 == i) %>%
select(faecalis_present,background_present) %>%
t() %>%
as.data.frame() %>%
rename(present = V1)
contingency_table_absent = background_faecalis %>%
filter(node2 == i) %>%
select(faecalis_absent,background_absent) %>%
t() %>%
as.data.frame() %>%
rename(absent = V1)
contingency_table = bind_cols(contingency_table_present,contingency_table_absent) %>%
as.matrix()
stat = rstatix::fisher_test(contingency_table) %>%
mutate(cytokine = i )
out = rbind(stat,out)
}
corrected_pvalues = out %>%
rstatix::adjust_pvalue(method = "fdr")
# Now calculating fold change
log2fc = background_faecalis %>%
select(node2,
background_present,
faecalis_present) %>%
mutate(n_background_proteins = n_background_proteins,
n_faecalis_proteins = n_faecalis_proteins,
proportion_faecalis = faecalis_present/n_faecalis_proteins,
proportion_background = background_present/n_background_proteins,
log2fc = log2(proportion_faecalis/proportion_background)) %>%
select(node2,log2fc,proportion_in_condition = proportion_faecalis,
proportion_in_background = proportion_background)
# all metrics
faecalis_metrics = corrected_pvalues %>%
inner_join(log2fc,by = c("cytokine" ="node2")) %>%
mutate(enrichment_score = -log10(p.adj) * log2fc,
condition = "faecalis")
write_lines(up_in_faecalis,"01_r_processed_data/cytokine_inference/faecalis_protein_list")### Now lets try doing this in Facalis vs Faecium ###
faecalis_v_faecium = readr::read_csv("01_r_processed_data/faecalis_v_faecium.csv")
# Name Mapping
annotations = read_csv("00_r_input_data/gene_uniprot_annotations.csv")
#changing the names to uniprotid
faecalis_v_faecium = left_join(faecalis_v_faecium,annotations,by = c("uniprotid"))
# up in faceium
up_in_faecium = faecalis_v_faecium %>%
filter(log2FC<0) %>%
filter(adj.pvalue <= 0.05) %>%
arrange(adj.pvalue) %>%
pull(gene_name)
n_faecium_proteins = length(up_in_faecium)
faecium_cytokines = string_db_results %>%
filter(`#node1` %in% up_in_faecium) %>%
group_by(node2) %>%
summarise(n= n()) %>%
mutate(condition = "faecium_present")
background_faecium = bind_rows(background_cytokines,faecium_cytokines) %>%
pivot_wider(names_from = condition, values_from = n) %>%
mutate(faecium_present = replace_na(faecium_present,0),
background_absent = n_background_proteins- background_present,
faecium_absent = n_faecium_proteins - faecium_present)
# Making a contingency table
out = data.frame()
for(i in unique(background_faecalis$node2)){
contingency_table_present = background_faecium %>%
filter(node2 == i) %>%
select(faecium_present,background_present) %>%
t() %>%
as.data.frame() %>%
rename(present = V1)
contingency_table_absent = background_faecium %>%
filter(node2 == i) %>%
select(faecium_absent,background_absent) %>%
t() %>%
as.data.frame() %>%
rename(absent = V1)
contingency_table = bind_cols(contingency_table_present,contingency_table_absent) %>%
as.matrix()
stat = rstatix::fisher_test(contingency_table) %>%
mutate(cytokine = i )
out = rbind(stat,out)
}
corrected_pvalues = out %>%
rstatix::adjust_pvalue(method = "fdr")
# Now calculating fold change
log2fc = background_faecium %>%
select(node2,
background_present,
faecium_present) %>%
mutate(n_background_proteins = n_background_proteins,
n_faecium_proteins = n_faecium_proteins,
proportion_faecium= faecium_present/n_faecium_proteins,
proportion_background = background_present/n_background_proteins,
log2fc = log2(proportion_faecium/proportion_background)) %>%
select(node2,log2fc,proportion_in_condition = proportion_faecium,
proportion_in_background = proportion_background)
# all metrics
faecium_metrics = corrected_pvalues %>%
inner_join(log2fc,by = c("cytokine" ="node2")) %>%
mutate(enrichment_score = -log10(p.adj) * log2fc,
condition = "faecium")
write_lines(up_in_faecium,"01_r_processed_data/cytokine_inference/faecium_protein_list")all_metrics = dplyr::bind_rows(infected_metrics,healthy_metrics,faecalis_metrics,faecium_metrics) %>%
mutate(direction_of_change = case_when(log2fc > 0 ~ "increased",
TRUE ~ "decreased"),
)
all_metrics$condition = factor(all_metrics$condition, levels=c("infected","healthy","faecalis","faecium"))
s6d = all_metrics %>%
ggplot(aes(reorder(cytokine, -p.adj),p.adj,color = condition,size = direction_of_change))+
geom_point()+
coord_flip()+
geom_hline(yintercept = 0.05,linetype = "dashed",color = "red")+
theme(legend.position = "bottom",axis.text.x = element_text(angle = 90))+
xlab("Cytokine")+
scale_colour_manual(values = c("#FFA500","#FDE725FF","#440154FF","#2A788EFF"))
s6dggsave("03_supplemental/s6_cytokine_inference/s6d.pdf",plot = s6d,width = 5,height = 12,units = "in")Figure plotted via cytoscape.
# Since we have
cytokines_in_infected_network = infected_metrics %>%
filter(p < 0.05) %>%
pull(cytokine)
#writing a file so we can look at the network further.
infected_cytokine_network = string_db_results %>%
filter(node2 %in% cytokines_in_infected_network,
`#node1` %in% up_in_infected)
write_csv(infected_cytokine_network,"01_r_processed_data/cytokine_inference/03_infected_protein_cytokine_network_filtered.csv")##S7- PTM analysis of proteomics data via open search.
open_search_results = read_tsv("00_r_input_data/global.modsummary.tsv") %>% select(Modification, `Mass Shift`,!contains("percent")) %>%
mutate(all_plexes_sum = (plex1_PSMs+plex2_PSMs+plex3_PSMs+plex4_PSMs+plex5_PSMs+plex6_PSMs+plex7_PSMs),
all_PSMs = sum(plex1_PSMs,plex2_PSMs,plex3_PSMs,plex4_PSMs,plex5_PSMs,plex6_PSMs,plex7_PSMs),
all_plexes_percent = all_plexes_sum/all_PSMs * 100,
mass_modification = paste0(`Mass Shift`,"-",Modification))
open_1_percent = open_search_results %>%
filter(all_plexes_percent >= 1)
p1 = open_1_percent %>%
ggplot(aes(reorder(mass_modification,all_plexes_percent),all_plexes_percent))+
geom_col()+
coord_flip()
p1 #total number of MS2 scans
ms2_scans = read_delim("../proteomics/gnps/clusterinfo/93fdf96f02e44389b503c53353712f6a.clusterinfo") %>%
group_by(`#Filename`) %>%
summarise(n = n())
27017 + 16665## [1] 43682
PTM Analysis - GNPS
## Reading in the PTM data from GNPS ##
ptm_data = read_delim("../metabolomics/ProteoSAFe-METABOLOMICS-SNETS-V2-3e73be4d-view_edges_delta_histogram/networkedges_selfloop/ae6d9d27f8684ad39b8e73b2d6f3da2e..selfloop",na = " ") %>%
mutate(DeltaMZ = abs(DeltaMZ))
# filtering to only include the data in group
data_f = ptm_data %>%
filter(#DeltaMZ >= min & DeltaMZ <= max,
DeltaMZ > 0)
# Extracting the annotations from ptm data
annotations = data_f %>%
group_by(EdgeAnnotation) %>%
summarise(mean = mean(DeltaMZ), n = n()) %>%
filter(n >= 10,
!is.na(EdgeAnnotation))
max_annotation_value = annotations %>%
pull(mean) %>%
max()
# plotting
p1 = data_f %>%
filter(DeltaMZ <= max_annotation_value + 20) %>%
#filter(group == group) %>%
ggplot(aes(DeltaMZ))+
geom_histogram(binwidth = 1)+
ggrepel::geom_text_repel(data = annotations, aes(x = mean, y = 1000,label = EdgeAnnotation, angle = 90),max.overlaps = 1000)
p1GNPS table
arranged_annotations = dplyr::arrange(annotations,-n)
DT::datatable(arranged_annotations)#Making a plot of the table
pdf("annotations.pdf",height = 4, width = 4)
gridExtra::grid.table(arranged_annotations)
dev.off()## png
## 2
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
known_indicatiors_of_bacterial_infection = c("CRP","SAA1")
f = dplyr::filter(pl,Gene %in% known_indicatiors_of_bacterial_infection) %>%
mutate(Condition = case_when(Condition == "Healthy" ~ "Healthy",
TRUE ~ "Infected"))
data = f
list = list()
for(i in known_indicatiors_of_bacterial_infection){
loop = filter(data, Gene == i)
mod = glm(as.factor(Condition) ~ Abundance,
data = data,
family = "binomial")
predicted <- predict(mod ,family = "binomial", loop, type="response")
#define object to plot
rocobj <- pROC::roc(loop$Condition, predicted)
auc = round(as.numeric(gsub(".*: ", "",rocobj$auc)),2)
#create ROC plot
p1 = pROC::ggroc(rocobj)+
ggtitle(paste0(i," AUC: ",auc))
list[[i]] = p1
}
s8a = do.call("grid.arrange", c(list, ncol = 2),) s8a## TableGrob (1 x 2) "arrange": 2 grobs
## z cells name grob
## CRP 1 (1-1,1-1) arrange gtable[layout]
## SAA1 2 (1-1,2-2) arrange gtable[layout]
ggsave("03_supplemental/s8_known_biomarkers_of_infection/s8a.pdf",s8a)#Loading and processing the data
#reading in the data
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv")
# C-reactive Protein = "P02741
# Calcitonin = P01258
# Serum Amyloid A = P0DJI8
known_indicatiors_of_bacterial_infection = c("CRP","SAA1")
f = dplyr::filter(pl,Gene %in% known_indicatiors_of_bacterial_infection)
library(rstatix)
library(ggpubr)
ttest <-
f %>%
group_by(Gene) %>%
rstatix::t_test(Abundance ~ Condition) %>%
add_y_position()
s8b = f %>%
ggplot(aes(Condition,Abundance))+
geom_violin()+
geom_point()+
ggpubr::stat_pvalue_manual(ttest)+
ylab("Abundance")+
facet_wrap(~Gene)+
ggtitle("Previously described biomarkers")
s8bggsave("03_supplemental/s8_known_biomarkers_of_infection/s8b.pdf",s8b)## Loading in all the data ###
md = read_csv("00_r_input_data/clinical_metadata.csv")
gene_data = read_csv("00_r_input_data/gene_presence_absence_long.csv")
## Combining data and metadata ##
gene_combined = inner_join(md,gene_data,by = "sample_id")
# Formating ##
faecalis = gene_combined %>%
filter(condition == "faecalis") %>%
select(sample_id,death_during_admission,Gene,value) %>%
pivot_wider(names_from = Gene,values_from = value)
## Getting into a format where we can do the clustering ##
faecalis_data = faecalis[,3:28290] %>%
#Remove columns that only contain 0s
select_if(colSums(.) != 0) %>%
as.data.frame()
## Fixing the row names so they match for the heatmap ##
rownames(faecalis_data) = faecalis$sample_id
faecalis_annotation = faecalis[,2] %>%
as.data.frame()
rownames(faecalis_annotation) = faecalis$sample_id
## Generating a dendrogram without a heatmap ##
library(dendextend)
d = dist(faecalis_data,method = "binary")
dend = as.dendrogram(hclust(d, method = "ward.D2"))
# let's add some color: Red = dead, black = okay
colors_to_use = c("#000000","#FF0000")[as.numeric(as.factor(faecalis_annotation$death_during_admission))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
plot(dend)pdf("03_supplemental/s9_sequencing_based_prediction_of_clinical_outcome/s9a.pdf")
plot(dend)
dev.off()## png
## 2
library(pheatmap)
## Generating a dendogram with a heatmap ##
faecalis_annotation = faecalis_annotation %>%
mutate(death_during_admission = as.character(death_during_admission)) %>%
select(death = death_during_admission)
#plotting with a heatmap
pheatmap(faecalis_data,
clustering_distance_rows = "binary",
clustering_method = "ward.D2",
annotation_row = faecalis_annotation,
labels_col=rep("",ncol(faecalis_data)))## Processing the data to only contain the faecium isolates ###
faecium = gene_combined %>%
filter(condition == "faecium") %>%
select(sample_id,death_during_admission,Gene,value) %>%
pivot_wider(names_from = Gene,values_from = value) %>%
# Removing weird sample that was found to be E.coli.
filter(sample_id != "S68")
# Processing to get rid of columns we dont need
faecium_data = faecium[,3:28290] %>%
#Remove columns that only contain 0s
select_if(colSums(.) != 0) %>%
as.data.frame()
# Changing the row names so we can plot
rownames(faecium_data) = faecium$sample_id
# Extracting the annotations
faecium_annotation = faecium[,2] %>%
as.data.frame()
rownames(faecium_annotation) = faecium$sample_id
## Generating a dendrogram without a heatmap ##
library(dendextend)
d = dist(faecium_data,method = "binary")
dend = as.dendrogram(hclust(d, method = "ward.D2"))
# let's add some color: Red = dead, black = okay
colors_to_use = c("#000000","#FF0000")[as.numeric(as.factor(faecium_annotation$death_during_admission))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd",8)
plot(dend)pdf("03_supplemental/s9_sequencing_based_prediction_of_clinical_outcome/s9abpdf")
plot(dend)
dev.off()## png
## 2
#annotations for pheatmatp. Needs to be coded as a charachter becuause they have a bug in their code.
faecium_annotation = faecium_annotation %>%
mutate(death_during_admission = as.character(death_during_admission)) %>%
select(death = death_during_admission)
#plotting with a heatmap
pheatmap(faecium_data,
clustering_distance_rows = "binary",
clustering_method = "ward.D2",
annotation_row = faecium_annotation,
labels_col=rep("",ncol(faecium_data)))##reading in the data##
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv") %>%
rename(tmt_id = sample_id)
## Clinical metadata ##
map = readxl::read_excel("00_r_input_data/proteomics_sample_mapping.xlsx") %>%
mutate(tmt_id = paste0("plex",plex_number,".",TMT_channel)) %>%
select(tmt_id,sample_id = sample_name)
md = read_csv("00_r_input_data/clinical_metadata.csv")
md_final = inner_join(map,md,by = "sample_id")
clean_clinical = inner_join(pl,md_final, by = "tmt_id") %>%
# Don't need the healthy samples here
filter(Condition != "Healthy") %>%
# - messes up EFS for some reason, very annoying.
mutate(Gene = gsub("-","_",Gene)) %>%
select(sample_id,death_during_admission,Gene,Abundance) %>%
tidyr::pivot_wider(names_from = Gene,
values_from = Abundance) %>%
#Removing samples with NA.
filter(!is.na(death_during_admission)) %>%
dplyr::select_if(~ !any(is.na(.))) %>%
#needs to be a dataframe or EFS freaks out.
as.data.frame() %>%
select(-sample_id)
topn = read_csv("01_r_processed_data/top10_live_v_dead_proteomics_biomarkers.csv") %>%
pull(Gene)
data = inner_join(pl,md_final, by = "tmt_id") %>%
filter(Condition != "Healthy") %>%
mutate(Condition = death_during_admission) %>%
filter(!is.na(Condition))
s10a = make_violin_plot_proteomics(data,topn)ggsave("03_supplemental/s10_live_v_dead_biomarkers_violin/s10a.pdf",s10a,height = 6, width = 16)## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data/normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
topn = read_csv("01_r_processed_data/top10_live_v_dead_metabolomics_biomarkers.csv") %>%
pull(row_id)
data = normalized_data %>%
inner_join(gnps_annotations,by = "row_id") %>%
filter(!is.na(death_during_admission)) %>%
filter(row_id %in% topn) %>%
mutate(condition = as.character(death_during_admission))
s10b = make_violin_plot_metabolomics(data,topn,ncol = 5)ggsave("03_supplemental/s10_live_v_dead_biomarkers_violin/s10b.pdf",s10b,height = 16,width = 32)##Proteomics data ##
pl = read_csv("01_r_processed_data/quant_msstats_protein_level_data.csv") %>%
select(tmt_id = sample_id,everything())
## Clinical metadata ##
map = readxl::read_excel("00_r_input_data/proteomics_sample_mapping.xlsx") %>%
mutate(tmt_id = paste0("plex",plex_number,".",TMT_channel)) %>%
select(tmt_id,sample_id = sample_name)
md = read_csv("00_r_input_data/clinical_metadata.csv")
md_final = inner_join(map,md,by = "sample_id")
clean_clinical = inner_join(pl,md_final, by = "tmt_id")
clean_clinical_wide = clean_clinical %>%
select(sample_id,death_during_admission,Gene,Abundance) %>%
tidyr::pivot_wider(names_from = Gene,
values_from = Abundance) %>%
#filter(sample_id.y != "S49",
#sample_id.y != "S76") %>%
dplyr::select_if(~ !any(is.na(.))) %>%
#needs to be a dataframe or EFS freaks out.
as.data.frame()
## Metabolomics data
## Reading in the normalized data ##
normalized_data = read_csv("01_r_processed_data//normalized_metabolomics.csv")
gnps_annotations = read_csv("00_r_input_data/gnps_annotations.csv")
## Switching to the wide format ###
norm_wide = normalized_data %>%
select(-row_m_z,-row_retention_time,-value) %>%
tidyr::pivot_wider(names_from = row_id,values_from = norm_value)
## Looking for only columns that were annotated ##
gnps_and_in_norm = intersect(unique(gnps_annotations$row_id), colnames(norm_wide))
gnps_annotated_metabolites = norm_wide %>%
as.data.frame() %>%
select(sample_id,death_during_admission,gnps_and_in_norm) %>%
filter(!is.na(death_during_admission))
### Combining all data ###
all= inner_join(clean_clinical_wide,gnps_annotated_metabolites,by = "sample_id")
# Getting into a format where we can do the clustering ##
clust_data = all[,2:491] %>%
#Remove columns that only contain 0s
select_if(colSums(.) != 0) %>%
as.data.frame()
## Fixing the row names so they match for the heatmap ##
rownames(clust_data) = clust_data$sample_id
annotation = all[,2] %>%
as.data.frame()
## Generating a dendrogram without a heatmap ##
library(dendextend)
#scaling so that proteomics and metabolomics are in terms of same units
scaled = scale(clust_data)
d = dist(scaled)
dend = as.dendrogram(hclust(d, method = "ward.D2"))
# let's add some color: Red = dead, black = okay
colors_to_use = c("#000000","#FF0000")[as.numeric(as.factor(all$death_during_admission))]
#getting in the same order as dendrogram
colors_to_use = colors_to_use[order.dendrogram(dend)]
# Now we can color the brancehes
dend = branches_color(dend,col = colors_to_use,) %>%
set("branches_lwd", 8)
plot(dend)
dend %>% rect.dendrogram(k=2,
border = 8, lty = 5, lwd = 2)
xtab <- as.table(rbind(c(4, 8), c(13, 53)))
dimnames(xtab) <- list(
group = c("k1", "k2"),
death = c("yes", "no")
)
fisher_test(xtab)This figure was generated in Biorender.
https://www.tidymodels.org/start/case-study/
## Loading in all the data needed for the model ##
mapping = readxl::read_excel("../proteomics/sample_mapping.xlsx") %>%
mutate(sample_id = paste0("plex",plex_number,".",TMT_channel)) %>%
select(sample_id,sample_name)
proteomics = read_csv("../Analysis/quant_msstats_protein_level_data.csv") %>%
separate(Protein, c("orientation","uniprotid","locus"),sep ="\\|") %>%
inner_join(protein_annotations,by = c("uniprotid" = "Entry")) %>%
mutate(sample_id = paste0(Mixture,".",Channel)) %>%
inner_join(mapping,by = "sample_id") %>%
select(sample_name,uniprotid,Abundance) %>%
pivot_wider(names_from = uniprotid,values_from = Abundance)
gnps_annotations = read_csv("gnps_annotations.csv")
metabolomics = read_csv("../Manuscript/normalized_metabolomics.csv") %>%
select(sample_name = sample_id,condition,row_id,norm_value) %>%
# Only looking at the metabolites that we were able to identify
#filter(row_id %in% gnps_annotations$row_id) %>%
pivot_wider(names_from = row_id,values_from = norm_value)
all = inner_join(proteomics,metabolomics,by= "sample_name") %>%
select(sample_name,condition,everything()) %>%
filter(condition %in% c("faecalis","faecium"))
write_csv(all, "all_features_for_lasso_regression.csv")### Reading in data for the regression ###
all = read_csv("all_features_for_lasso_regression.csv")
## What values have less than 50% of the features?
missing_count = colSums(is.na(all)/nrow(all) * 100)
missing_g_50 = names(missing_count[missing_count>50])
## Removing features missing > 50% of values ##
all_50_missing_removed = all %>%
select(!contains(missing_g_50))
# Setting seed to make steps relying on randomness reproducible
set.seed(123)
## Imputing values using missranger package ##
library(missRanger)
all_na_impute = missRanger(all_50_missing_removed)
### Performing Lasso Regression ###
library(tidymodels)
# Splitting the original dataset
split = initial_split(all_na_impute, strata = condition)
# Putting all data in the training set. Did this because we don't have enough samples for classic training/ validation set.
train = training(split)
test = testing(split)
## Defining the Cross Validation Folds ##
folds = vfold_cv(train, v = 10)
## Defining a recipe. ##
recipe = recipe(condition ~ ., data = train) %>%
# Removing the ID column from consideration
update_role(sample_name, new_role = "ID") %>%
# Getting rid of all variables with zero variance (if they exist)
step_zv(all_numeric(), -all_outcomes()) %>%
# Normalizing data to work with the lasso regression.
step_normalize(all_numeric(), -all_outcomes())
# making sure strings are strings and not factors.
prep = recipe %>%
prep(strings_as_factors = FALSE)
# Defining the lasso model. Here the penalty is a tuned feature.
# Mixture specifies we should use L1 regularization, which equates to lasso regression.
lasso_spec = logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
# Making the recipe into a workflow.
wf = workflow() %>%
# Processes the data
add_recipe(recipe) %>%
# Adds the model
add_model(lasso_spec)
## Creating tuning grid ##
# 0.001 to 0.3 by steps of 0.01, as in Jacob's paper
lr_reg_grid = tibble(penalty = seq(from = 0.001, to = 0.3, by = 0.01))
## Tuning the model to find best penalty to use ##
lr_res = wf %>%
tune_grid(resample = folds,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc))
## Looking at how the penalty affects the performance of the model ##
lr_plot = lr_res %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
ylab("Area under the ROC Curve") +
scale_x_log10(labels = scales::label_number())
lr_plot
# Extracting the tuning parameter that worked the best #
lr_best = lr_res %>%
select_best("roc_auc")
## Updating workflow to contain results from tuning ##
lr_auc = lr_res %>%
collect_predictions(parameters = lr_best) %>%
roc_curve(condition, .pred_faecalis) %>%
mutate(model = "multi-omics")
autoplot(lr_auc)
##Doing this with test set ##
final_wf = wf %>%
finalize_workflow(lr_best)
final_fit <-
final_wf %>%
last_fit(split)
# Evaluating on the test set
final_fit %>%
collect_predictions() %>%
roc_curve(condition, .pred_faecalis) %>%
autoplot()### Performing Lasso Regression ###
library(tidymodels)
metabolomics_all_na_impute = all_na_impute %>%
select(starts_with("RID"))
set.seed(2)
# Splitting the original dataset
split = initial_split(all_na_impute, strata = condition)
#Putting all data as training for now
train = training(split,prop = 1)
# Cross validation folds
folds = vfold_cv(train, v = 10)
# Defining a recipe.
recipe = recipe(condition ~ ., data = train) %>%
update_role(sample_name, new_role = "ID") %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric(), -all_outcomes())
prep = recipe %>%
prep(strings_as_factors = FALSE)
# Defining the lasso model.
lasso_spec = logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet")
# Making the recipe into a workflow.
wf = workflow() %>%
add_recipe(recipe) %>%
add_model(lasso_spec)
# Training the model
lasso_fit = wf %>%
fit(data = train)
## Creating tuning grid ##
# 0.001 to 0.3 by steps of 0.01, as in Jacob's paper
lr_reg_grid = tibble(penalty = seq(from = 0.001, to = 0.3, by = 0.01))
## Getting the tuning result ##
lr_res = wf %>%
tune_grid(folds,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc))
## Looking at how the penalty affects the performance of the model ##
lr_plot = lr_res %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
ylab("Area under the ROC Curve") +
scale_x_log10(labels = scales::label_number())
lr_plot
#Extracting the tuning parameter that worked the best
lr_best = lr_res %>%
show_best("roc_auc", n = 1)
## Looking at the ROC Plot using lasso regression with all the predictors in the study ##
lr_auc = lr_res %>%
collect_predictions(parameters = lr_best) %>%
roc_curve(condition, .pred_faecalis) %>%
mutate(model = "Logistic Regression")
autoplot(lr_auc)# Bringing in the model from everything
rf_auc = rf_res %>%
collect_predictions(parameters = rf_best) %>%
roc_curve(children, .pred_children) %>%
mutate(model = "Random Forest")
# Bringing in the model from just metabolomics
# Bringing in the model from just proteomics
# Combining and plotting all models.
all_models = bind_rows(rf_auc, lr_auc) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
scale_color_viridis_d(option = "plasma", end = .6)md = readxl::read_excel("../proteomics_metadata.xlsx") %>%
group_by(condition) %>%
summarise(n = n())
mdproteomics_count = read_tsv("../proteomics/fragpipe_output_fixed_labels/tmt-report/abundance_gene_MD.tsv") %>%
nrow()
conserved_all_samples_proteomics_count = read_tsv("../proteomics/fragpipe_output_fixed_labels/tmt-report/abundance_gene_MD.tsv") %>%
select(6:length(.)) %>%
na.omit() %>%
nrow()
metabolite_features_count = read_csv("../metabolomics/mzmine_output/features_quant.csv") %>%
nrow()
## finding out how many annotated features there are.
gnps_annotations = read_csv("gnps_annotations.csv")
gnps_annotations_n = nrow(gnps_annotations)
## How many of these annotated features were found in every sample?
gnps_annotations_all = read_csv("unnormalized_metabolomics.csv") %>%
filter(row_id %in% gnps_annotations$row_id) %>%
filter(value > 0 ) %>%
group_by(row_id) %>%
summarise(n = n()) %>%
filter(n == 105) %>%
nrow()#reading in the data
pl = read_csv("../Analysis/quant_msstats_protein_level_data.csv")
clean = pl %>%
tidyr::separate(Protein, c("orientation","uniprotid","locus"),sep ="\\|") %>%
dplyr::group_by(Mixture,uniprotid,Channel,Condition) %>%
dplyr::summarise(sum = sum(Abundance))
infected_healthy_l = clean %>%
mutate(classlabels = case_when(Condition %in% c("Faecalis","Faecium") ~ 1,
TRUE ~ 0))
infected_healthy_w = infected_healthy_l %>%
pivot_wider(names_from = uniprotid,
values_from = sum) %>%
mutate(sampleid = paste0(Mixture,".",Channel)) %>%
ungroup() %>%
dplyr::select(-Mixture,-Channel,-Condition,-sampleid) %>%
select_if(~ !any(is.na(.))) %>%
as.data.frame()
#need to come up with some sort of way to combine all of the fractions and plexes
# The problem is that na.omit filters everything out.
set.seed(2)
ensemble_result = EFS::ensemble_fs(infected_healthy_w,1,cor_threshold = 0)
long = ensemble_result %>%
as.data.frame() %>%
tibble::rownames_to_column("model") %>%
pivot_longer(2:length(.))
top = long %>%
group_by(name) %>%
dplyr::summarise(sum = sum(value)) %>%
dplyr::arrange(-sum) %>%
mutate(rank = dense_rank(desc(sum))) %>%
select(rank,everything()) %>%
filter(name %in% known_indicatiors_of_bacterial_infection)protein_annotations = read_csv("../Analysis/all_protein_annotations.csv")
protein_sum = read_csv("../Analysis/quant_msstats_protein_level_data.csv") %>%
separate(Protein, c("orientation","uniprotid","locus"),sep ="\\|") %>%
inner_join(protein_annotations,by = c("uniprotid" = "Entry")) %>%
mutate(sample_id = paste0(Mixture,".",Channel))
topn = top %>%
pull(name)
ROC_data = dplyr::filter(protein_sum,uniprotid %in% topn) %>%
dplyr::mutate(Condition = case_when(Condition %in% c("Faecalis","Faecium") ~ "infected",
TRUE ~ Condition))
make_roc = function(ROC_data){
list = list()
for(i in topn){
data = filter(ROC_data, uniprotid == i)
mod = glm(as.factor(Condition) ~ Abundance,
data = data,
family = "binomial")
predicted <- predict(mod ,family = "binomial", data, type="response")
#define object to plot
rocobj <- pROC::roc(data$Condition, predicted)
auc = round(as.numeric(gsub(".*: ", "",rocobj$auc)),2)
#create ROC plot
p1 = pROC::ggroc(rocobj)+
ggtitle(paste0(i," AUC: ",auc))
list[[i]] = p1
}
return(list)
}
l = make_roc(ROC_data)
f2e = do.call(ggarrange,l)
f2eNote that we got rid of all the missing data for the metabolomics data as part of the normalization used.
# All combined.
all = read_csv("all_features_for_lasso_regression.csv") %>%
select(3:length(.))
mean(colSums(is.na(all))/nrow(all) * 100)
# Looking at just metabolomics.
metabolomics = all %>%
select(dplyr::starts_with("RID"))
# Sum of na/ total rows * 100 = percent NA.
mean(colSums(is.na(metabolomics))/nrow(metabolomics) * 100)
# Just proteomics.
proteomics = all %>%
select(!dplyr::starts_with("RID"))
# Sum of na/ total rows * 100 = percent NA.
mean(colSums(is.na(proteomics))/nrow(proteomics) * 100)annotations = read_delim("../proteomics/fragpipe_output_fixed_labels/tmt-report/abundance_gene_MD.tsv") %>%
select(gene_name = Index,uniprotid = ProteinID)
write_csv(annotations,"gene_uniprot_annotations.csv")